Blob


/*
 * Forward typedefs: let every struct tag declared in this header be
 * used as a plain type name before (or without) its full definition.
 */
typedef struct Config		Config;
typedef struct AMap		AMap;
typedef struct AMapN		AMapN;
typedef struct Arena		Arena;
typedef struct AState		AState;
typedef struct ArenaCIG		ArenaCIG;
typedef struct ArenaHead	ArenaHead;
typedef struct ArenaPart	ArenaPart;
typedef struct ArenaTail	ArenaTail;
typedef struct ATailStats	ATailStats;
typedef struct CIBlock		CIBlock;
typedef struct Clump		Clump;
typedef struct ClumpInfo	ClumpInfo;
typedef struct Graph		Graph;
typedef struct IAddr		IAddr;
typedef struct IBucket		IBucket;
typedef struct IEStream		IEStream;
typedef struct IEntry		IEntry;
typedef struct IFile		IFile;
typedef struct ISect		ISect;
typedef struct Index		Index;
typedef struct Lump		Lump;
typedef struct DBlock		DBlock;
typedef struct Part		Part;
typedef struct Statbin		Statbin;
typedef struct Statdesc		Statdesc;
typedef struct Stats		Stats;
typedef struct ZBlock		ZBlock;
typedef struct Round		Round;
typedef struct Bloom		Bloom;

/* IEStream is never completed in this header (Plan 9 compiler hint). */
#pragma incomplete IEStream
/*
 * All-ones values of each unsigned width.  The outer cast matters for
 * the 8-bit case, where ~ is applied after integer promotion.
 */
#define	TWID32	((u32int)~(u32int)0)
#define	TWID64	((u64int)~(u64int)0)
#define	TWID8	((u8int)~(u8int)0)
38 enum
39 {
40 /*
41 * formerly fundamental constant,
42 * now a server-imposed limitation.
43 */
44 VtMaxLumpSize = 56*1024,
46 ABlockLog = 9, /* log2(512), the quantum for reading arenas */
47 ANameSize = 64,
48 MaxDiskBlock = 64*1024, /* max. allowed size for a disk block */
49 MaxIoSize = 64*1024, /* max. allowed size for a disk io operation */
50 PartBlank = 256*1024, /* untouched section at beginning of partition */
51 HeadSize = 512, /* size of a header after PartBlank */
52 MinArenaSize = 1*1024*1024, /* smallest reasonable arena size */
53 IndexBase = 1024*1024, /* initial address to use in an index */
54 MaxIo = 64*1024, /* max size of a single read or write operation */
55 ICacheBits = 16, /* default bits for indexing icache */
56 MaxAMap = 31*1024, /* max. allowed arenas in an address mapping; must be < 32*1024 */
57 Unspecified = TWID32,
59 /*
60 * return codes from syncarena
61 */
62 SyncDataErr = 1 << 0, /* problem reading the clump data */
63 SyncCIErr = 1 << 1, /* found erroneous clump directory entries */
64 SyncCIZero = 1 << 2, /* found unwritten clump directory entries */
65 SyncFixErr = 1 << 3, /* error writing fixed data */
66 SyncHeader = 1 << 4, /* altered header fields */
68 /*
69 * error severity
70 */
71 EOk = 0, /* error expected in normal operation */
72 EStrange, /* strange error that should be logged */
73 ECorrupt, /* corrupted data found in arenas */
74 EICorrupt, /* corrupted data found in index */
75 EAdmin, /* should be brought to administrators' attention */
76 ECrash, /* really bad internal error */
77 EBug, /* a limitation which should be fixed */
78 EInconsist, /* inconsistencies between index and arena */
79 EMax,
81 /*
82 * internal disk formats for the venti archival storage system
83 */
84 /*
85 * magic numbers on disk
86 */
87 _ClumpMagic = 0xd15cb10cU, /* clump header, deprecated */
88 ClumpFreeMagic = 0, /* free clump; terminates active clump log */
90 ArenaPartMagic = 0xa9e4a5e7U, /* arena partition header */
91 ArenaMagic = 0xf2a14eadU, /* arena trailer */
92 ArenaHeadMagic = 0xd15c4eadU, /* arena header */
94 BloomMagic = 0xb1004eadU, /* bloom filter header */
95 BloomMaxHash = 32,
97 ISectMagic = 0xd15c5ec7U, /* index header */
99 ArenaPartVersion = 3,
100 ArenaVersion4 = 4,
101 ArenaVersion5 = 5,
102 BloomVersion = 1,
103 IndexVersion = 1,
104 ISectVersion1 = 1,
105 ISectVersion2 = 2,
107 /*
108 * encodings of clumps on disk
109 */
110 ClumpEErr = 0, /* can't happen */
111 ClumpENone, /* plain */
112 ClumpECompress, /* compressed */
113 ClumpEMax,
115 /*
116 * sizes in bytes on disk
117 */
118 U8Size = 1,
119 U16Size = 2,
120 U32Size = 4,
121 U64Size = 8,
123 ArenaPartSize = 4 * U32Size,
124 ArenaSize4 = 2 * U64Size + 6 * U32Size + ANameSize + U8Size,
125 ArenaSize5 = ArenaSize4 + U32Size,
126 ArenaSize5a = ArenaSize5 + 2 * U8Size + 2 * U32Size + 2 * U64Size,
127 ArenaHeadSize4 = U64Size + 3 * U32Size + ANameSize,
128 ArenaHeadSize5 = ArenaHeadSize4 + U32Size,
129 BloomHeadSize = 4 * U32Size,
130 ISectSize1 = 7 * U32Size + 2 * ANameSize,
131 ISectSize2 = ISectSize1 + U32Size,
132 ClumpInfoSize = U8Size + 2 * U16Size + VtScoreSize,
133 ClumpSize = ClumpInfoSize + U8Size + 3 * U32Size,
134 MaxBloomSize = 1<<(32-3), /* 2^32 bits */
135 MaxBloomHash = 32, /* bits per score */
136 /*
137 * BUG - The various block copies that manipulate entry buckets
138 * would be faster if we bumped IBucketSize up to 8 and IEntrySize up to 40,
139 * so that everything is word-aligned. Buildindex is actually cpu-bound
140 * by the (byte at a time) copying in qsort.
141 */
142 IBucketSize = U32Size + U16Size,
143 IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize,
144 IEntryTypeOff = VtScoreSize + U32Size + U16Size + U64Size + U16Size,
145 IEntryAddrOff = VtScoreSize + U32Size + U16Size,
147 MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,
149 IcacheFrac = 1000000, /* denominator */
151 SleepForever = 1000000000, /* magic value for sleep time */
152 /*
153 * dirty flags - order controls disk write order
154 */
155 DirtyArena = 1,
156 DirtyArenaCib,
157 DirtyArenaTrailer,
158 DirtyMax,
160 ArenaCIGSize = 10*1024, // about 0.5 MB worth of IEntry.
162 VentiZZZZZZZZ
163 };
/*
 * Trace* character arrays; declared here, defined in another file.
 */
extern char	TraceDisk[];
extern char	TraceLump[];
extern char	TraceBlock[];
extern char	TraceProc[];
extern char	TraceWork[];
extern char	TraceQuiet[];
extern char	TraceRpc[];
173 /*
174 * results of parsing and initializing a config file
175 */
176 struct Config
178 char *index; /* name of the index to initialize */
179 int naparts; /* arena partitions initialized */
180 ArenaPart **aparts;
181 int nsects; /* index sections initialized */
182 ISect **sects;
183 Bloom *bloom; /* bloom filter */
184 u32int bcmem;
185 u32int mem;
186 u32int icmem;
187 int queuewrites;
188 char* haddr;
189 char* vaddr;
190 char* webroot;
191 };
193 /*
194 * a Part is the low level interface to files or disks.
195 * there are two main types of partitions
196 * arena paritions, which some number of arenas, each in a sub-partition.
197 * index partition, which only have one subpartition.
198 */
199 struct Part
201 int fd; /* rock for accessing the disk */
202 int mode;
203 u64int offset;
204 u64int size; /* size of the partiton */
205 u32int blocksize; /* block size for reads and writes */
206 u32int fsblocksize; /* minimum file system block size */
207 char *name;
208 char *filename;
209 Channel *writechan; /* chan[dcache.nblock](DBlock*) */
210 };
212 /*
213 * a cached block from the partition
214 * yuck -- most of this is internal structure for the cache
215 * all other routines should only use data
216 */
217 struct DBlock
219 u8int *data;
221 Part *part; /* partition in which cached */
222 u64int addr; /* base address on the partition */
223 u32int size; /* amount of data available, not amount allocated; should go away */
224 u32int mode;
225 u32int dirty;
226 u32int dirtying;
227 DBlock *next; /* doubly linked hash chains */
228 DBlock *prev;
229 u32int heap; /* index in heap table */
230 u32int used; /* last reference times */
231 u32int used2;
232 u32int ref; /* reference count */
233 RWLock lock; /* for access to data only */
234 Channel *writedonechan;
235 void* chanbuf[1]; /* buffer for the chan! */
236 };
238 /*
239 * a cached block from the partition
240 * yuck -- most of this is internal structure for the cache
241 * all other routines should only use data
242 * double yuck -- this is mostly the same as a DBlock
243 */
244 struct Lump
246 Packet *data;
248 Part *part; /* partition in which cached */
249 u8int score[VtScoreSize]; /* score of packet */
250 u8int type; /* type of packet */
251 u32int size; /* amount of data allocated to hold packet */
252 Lump *next; /* doubly linked hash chains */
253 Lump *prev;
254 u32int heap; /* index in heap table */
255 u32int used; /* last reference times */
256 u32int used2;
257 u32int ref; /* reference count */
258 QLock lock; /* for access to data only */
259 };
261 /*
262 * mapping between names and address ranges
263 */
264 struct AMap
266 u64int start;
267 u64int stop;
268 char name[ANameSize];
269 };
271 /*
272 * an AMap along with a length
273 */
274 struct AMapN
276 int n;
277 AMap *map;
278 };
280 /*
281 * an ArenaPart is a partition made up of Arenas
282 * it exists because most os's don't support many partitions,
283 * and we want to have many different Arenas
284 */
285 struct ArenaPart
287 Part *part;
288 u64int size; /* size of underlying partition, rounded down to blocks */
289 Arena **arenas;
290 u32int tabbase; /* base address of arena table on disk */
291 u32int tabsize; /* max. bytes in arena table */
293 /*
294 * fields stored on disk
295 */
296 u32int version;
297 u32int blocksize; /* "optimal" block size for reads and writes */
298 u32int arenabase; /* base address of first arena */
300 /*
301 * stored in the arena mapping table on disk
302 */
303 AMap *map;
304 int narenas;
305 };
307 /*
308 * info about one block in the clump info cache
309 */
310 struct CIBlock
312 u32int block; /* blocks in the directory */
313 int offset; /* offsets of one clump in the data */
314 DBlock *data;
315 };
317 /*
318 * Statistics kept in the tail.
319 */
320 struct ATailStats
322 u32int clumps; /* number of clumps */
323 u32int cclumps; /* number of compressed clumps */
324 u64int used;
325 u64int uncsize;
326 u8int sealed;
327 };
329 /*
330 * Arena state - represents a point in the data log
331 */
332 struct AState
334 Arena *arena;
335 u64int aa; /* index address */
336 ATailStats stats;
337 };
339 /*
340 * an Arena is a log of Clumps, preceeded by an ArenaHeader,
341 * and followed by a Arena, each in one disk block.
342 * struct on disk is not always up to date, but should be self-consistent.
343 * to sync after reboot, follow clumps starting at used until ClumpFreeMagic if found.
344 * <struct name="Arena" type="Arena *">
345 * <field name="name" val="s->name" type="AName"/>
346 * <field name="version" val="s->version" type="U32int"/>
347 * <field name="partition" val="s->part->name" type="AName"/>
348 * <field name="blocksize" val="s->blocksize" type="U32int"/>
349 * <field name="start" val="s->base" type="U64int"/>
350 * <field name="stop" val="s->base+2*s->blocksize" type="U64int"/>
351 * <field name="created" val="s->ctime" type="U32int"/>
352 * <field name="modified" val="s->wtime" type="U32int"/>
353 * <field name="sealed" val="s->sealed" type="Sealed"/>
354 * <field name="score" val="s->score" type="Score"/>
355 * <field name="clumps" val="s->clumps" type="U32int"/>
356 * <field name="compressedclumps" val="s->cclumps" type="U32int"/>
357 * <field name="data" val="s->uncsize" type="U64int"/>
358 * <field name="compresseddata" val="s->used - s->clumps * ClumpSize" type="U64int"/>
359 * <field name="storage" val="s->used + s->clumps * ClumpInfoSize" type="U64int"/>
360 * </struct>
361 */
362 struct Arena
364 QLock lock; /* lock for arena fields, writing to disk */
365 Part *part; /* partition in which arena lives */
366 int blocksize; /* size of block to read or write */
367 u64int base; /* base address on disk */
368 u64int size; /* total space in the arena */
369 u8int score[VtScoreSize]; /* score of the entire sealed & summed arena */
371 int clumpmax; /* ClumpInfos per block */
372 AState mem;
373 int inqueue;
375 /*
376 * fields stored on disk
377 */
378 u32int version;
379 char name[ANameSize]; /* text label */
380 ATailStats memstats;
381 ATailStats diskstats;
382 u32int ctime; /* first time a block was written */
383 u32int wtime; /* last time a block was written */
384 u32int clumpmagic;
386 ArenaCIG *cig;
387 int ncig;
388 };
390 struct ArenaCIG
392 u64int offset; // from arena base
393 };
395 /*
396 * redundant storage of some fields at the beginning of each arena
397 */
398 struct ArenaHead
400 u32int version;
401 char name[ANameSize];
402 u32int blocksize;
403 u64int size;
404 u32int clumpmagic;
405 };
407 /*
408 * most interesting meta information for a clump.
409 * stored in each clump's header and in the Arena's directory,
410 * stored in reverse order just prior to the arena trailer
411 */
412 struct ClumpInfo
414 u8int type;
415 u16int size; /* size of disk data, not including header */
416 u16int uncsize; /* size of uncompressed data */
417 u8int score[VtScoreSize]; /* score of the uncompressed data only */
418 };
420 /*
421 * header for an immutable clump of data
422 */
423 struct Clump
425 ClumpInfo info;
426 u8int encoding;
427 u32int creator; /* initial client which wrote the block */
428 u32int time; /* creation at gmt seconds since 1/1/1970 */
429 };
431 /*
432 * index of all clumps according to their score
433 * this is just a wrapper to tie together the index sections
434 * <struct name="Index" type="Index *">
435 * <field name="name" val="s->name" type="AName"/>
436 * <field name="version" val="s->version" type="U32int"/>
437 * <field name="blocksize" val="s->blocksize" type="U32int"/>
438 * <field name="tabsize" val="s->tabsize" type="U32int"/>
439 * <field name="buckets" val="s->buckets" type="U32int"/>
440 * <field name="buckdiv" val="s->div" type="U32int"/>
441 * <field name="bitblocks" val="s->div" type="U32int"/>
442 * <field name="maxdepth" val="s->div" type="U32int"/>
443 * <field name="bitkeylog" val="s->div" type="U32int"/>
444 * <field name="bitkeymask" val="s->div" type="U32int"/>
445 * <array name="sect" val="&s->smap[i]" elems="s->nsects" type="Amap"/>
446 * <array name="amap" val="&s->amap[i]" elems="s->narenas" type="Amap"/>
447 * <array name="arena" val="s->arenas[i]" elems="s->narenas" type="Arena"/>
448 * </struct>
449 * <struct name="Amap" type="AMap *">
450 * <field name="name" val="s->name" type="AName"/>
451 * <field name="start" val="s->start" type="U64int"/>
452 * <field name="stop" val="s->stop" type="U64int"/>
453 * </struct>
454 */
455 struct Index
457 u32int div; /* divisor for mapping score to bucket */
458 u32int buckets; /* last bucket used in disk hash table */
459 u32int blocksize;
460 u32int tabsize; /* max. bytes in index config */
462 int mapalloc; /* first arena to check when adding a lump */
463 Arena **arenas; /* arenas in the mapping */
464 ISect **sects; /* sections which hold the buckets */
465 Bloom *bloom; /* bloom filter */
467 /*
468 * fields stored in config file
469 */
470 u32int version;
471 char name[ANameSize]; /* text label */
472 int nsects;
473 AMap *smap; /* mapping of buckets to index sections */
474 int narenas;
475 AMap *amap; /* mapping from index addesses to arenas */
477 QLock writing;
478 };
480 /*
481 * one part of the bucket storage for an index.
482 * the index blocks are sequentially allocated
483 * across all of the sections.
484 */
485 struct ISect
487 Part *part;
488 int blocklog; /* log2(blocksize) */
489 int buckmax; /* max. entries in a index bucket */
490 u32int tabbase; /* base address of index config table on disk */
491 u32int tabsize; /* max. bytes in index config */
492 Channel *writechan;
493 Channel *writedonechan;
494 void *ig; /* used by buildindex only */
495 int ng;
497 /*
498 * fields stored on disk
499 */
500 u32int version;
501 u32int bucketmagic;
502 char name[ANameSize]; /* text label */
503 char index[ANameSize]; /* index owning the section */
504 u32int blocksize; /* size of hash buckets in index */
505 u32int blockbase; /* address of start of on disk index table */
506 u32int blocks; /* total blocks on disk; some may be unused */
507 u32int start; /* first bucket in this section */
508 u32int stop; /* limit of buckets in this section */
509 };
511 /*
512 * externally interesting part of an IEntry
513 */
514 struct IAddr
516 u64int addr;
517 u16int size; /* uncompressed size */
518 u8int type; /* type of block */
519 u8int blocks; /* arena io quanta for Clump + data */
520 };
522 /*
523 * entries in the index
524 * kept in IBuckets in the disk index table,
525 * cached in the memory ICache.
526 */
527 struct IEntry
529 /* on disk data - 32 bytes*/
530 u8int score[VtScoreSize];
531 IAddr ia;
533 IEntry *nexthash;
534 IEntry *nextdirty;
535 IEntry *next;
536 IEntry *prev;
537 u8int state;
538 };
/*
 * IE* state values.
 * NOTE(review): presumably the values stored in IEntry.state - confirm.
 */
enum
{
	IEClean		= 0,
	IEDirty		= 1,
	IESummary	= 2,
};
545 /*
546 * buckets in the on disk index table
547 */
548 struct IBucket
550 u16int n; /* number of active indices */
551 u32int buck; /* used by buildindex/checkindex only */
552 u8int *data;
553 };
555 /*
556 * temporary buffers used by individual threads
557 */
558 struct ZBlock
560 u32int len;
561 u32int _size;
562 u8int *data;
563 u8int *free;
564 };
566 /*
567 * simple input buffer for a '\0' terminated text file
568 */
569 struct IFile
571 char *name; /* name of the file */
572 ZBlock *b; /* entire contents of file */
573 u32int pos; /* current position in the file */
574 };
576 struct Statdesc
578 char *name;
579 ulong max;
580 };
/*
 * Statistic slots; each value is an index into Stats.n[] and statdesc[].
 * keep in sync with stats.c:/statdesc and httpd.c:/graphname
 */
enum
{
	StatRpcTotal,
	StatRpcRead,
	StatRpcReadOk,
	StatRpcReadFail,
	StatRpcReadBytes,
	StatRpcReadTime,
	StatRpcReadCached,
	StatRpcReadCachedTime,
	StatRpcReadUncached,
	StatRpcReadUncachedTime,
	StatRpcWrite,
	StatRpcWriteNew,
	StatRpcWriteOld,
	StatRpcWriteFail,
	StatRpcWriteBytes,
	StatRpcWriteTime,
	StatRpcWriteNewTime,
	StatRpcWriteOldTime,

	StatLcacheHit,
	StatLcacheMiss,
	StatLcacheRead,
	StatLcacheWrite,
	StatLcacheSize,
	StatLcacheStall,
	StatLcacheReadTime,

	StatDcacheHit,
	StatDcacheMiss,
	StatDcacheLookup,
	StatDcacheRead,
	StatDcacheWrite,
	StatDcacheDirty,
	StatDcacheSize,
	StatDcacheFlush,
	StatDcacheStall,
	StatDcacheLookupTime,

	StatDblockStall,
	StatLumpStall,

	StatIcacheHit,
	StatIcacheMiss,
	StatIcacheRead,
	StatIcacheWrite,
	StatIcacheFill,
	StatIcachePrefetch,
	StatIcacheDirty,
	StatIcacheSize,
	StatIcacheFlush,
	StatIcacheStall,
	StatIcacheReadTime,
	StatIcacheLookup,
	StatScacheHit,
	StatScachePrefetch,

	StatBloomHit,
	StatBloomMiss,
	StatBloomFalseMiss,
	StatBloomLookup,
	StatBloomOnes,
	StatBloomBits,

	StatApartRead,
	StatApartReadBytes,
	StatApartWrite,
	StatApartWriteBytes,

	StatIsectRead,
	StatIsectReadBytes,
	StatIsectWrite,
	StatIsectWriteBytes,

	StatSumRead,
	StatSumReadBytes,

	StatCigLoad,
	StatCigLoadTime,

	NStat		/* number of statistics; must stay last */
};
667 extern Statdesc statdesc[NStat];
669 /*
670 * statistics about the operation of the server
671 * mainly for performance monitoring and profiling.
672 */
673 struct Stats
675 ulong now;
676 ulong n[NStat];
677 };
679 struct Statbin
681 uint nsamp;
682 uint min;
683 uint max;
684 uint avg;
685 };
687 struct Graph
689 long (*fn)(Stats*, Stats*, void*);
690 void *arg;
691 long t0;
692 long t1;
693 long min;
694 long max;
695 long wid;
696 long ht;
697 int fill;
698 };
700 /*
701 * for kicking background processes that run one round after another after another
702 */
703 struct Round
705 QLock lock;
706 Rendez start;
707 Rendez finish;
708 Rendez delaywait;
709 int delaytime;
710 int delaykick;
711 char* name;
712 int last;
713 int current;
714 int next;
715 int doanother;
716 };
718 /*
719 * Bloom filter of stored block hashes
720 */
721 struct Bloom
723 RWLock lk; /* protects nhash, nbits, tab, mb */
724 QLock mod; /* one marker at a time, protects nb */
725 int nhash;
726 ulong size; /* bytes in tab */
727 ulong bitmask; /* to produce bit index */
728 u8int *data;
729 Part *part;
730 Channel *writechan;
731 Channel *writedonechan;
732 };
734 extern Index *mainindex;
735 extern u32int maxblocksize; /* max. block size used by any partition */
736 extern int paranoid; /* should verify hashes on disk read */
737 extern int queuewrites; /* put all lump writes on a queue and finish later */
738 extern int readonly; /* only allowed to read the disk data */
739 extern Stats stats;
740 extern u8int zeroscore[VtScoreSize];
741 extern int compressblocks;
742 extern int writestodevnull; /* dangerous - for performance debugging */
743 extern int bootstrap; /* writes but does not index - cannot read */
744 extern int collectstats;
745 extern QLock memdrawlock;
746 extern int icachesleeptime;
747 extern int minicachesleeptime;
748 extern int arenasumsleeptime;
749 extern int manualscheduling;
750 extern int l0quantum;
751 extern int l1quantum;
752 extern int ignorebloom;
753 extern int icacheprefetch;
754 extern int syncwrites;
755 extern int debugarena; /* print in arena error msgs; -1==unknown */
757 extern Stats *stathist;
758 extern int nstathist;
759 extern ulong stattime;
/*
 * When PLAN9PORT is not defined: declare the argument type of the "V"
 * print verb for the compiler's varargck check, and define ODIRECT as 0.
 * NOTE(review): presumably 0 so that or-ing ODIRECT into open flags is
 * a no-op - confirm.
 */
#if !defined(PLAN9PORT)
#pragma varargck type "V" uchar*
#define ODIRECT 0
#endif