/*
 * Forward typedefs: every struct tag gets a type alias of the same name,
 * so the declarations below can refer to these types (for example
 * through pointers) before the corresponding struct body appears.
 */
1 typedef struct Config Config;
2 typedef struct AMap AMap;
3 typedef struct AMapN AMapN;
4 typedef struct Arena Arena;
5 typedef struct AState AState;
6 typedef struct ArenaCIG ArenaCIG;
7 typedef struct ArenaHead ArenaHead;
8 typedef struct ArenaPart ArenaPart;
9 typedef struct ArenaTail ArenaTail;
10 typedef struct ATailStats ATailStats;
11 typedef struct CIBlock CIBlock;
12 typedef struct Clump Clump;
13 typedef struct ClumpInfo ClumpInfo;
14 typedef struct Graph Graph;
15 typedef struct IAddr IAddr;
16 typedef struct IBucket IBucket;
17 typedef struct IEStream IEStream;
18 typedef struct IEntry IEntry;
19 typedef struct IFile IFile;
20 typedef struct ISect ISect;
21 typedef struct Index Index;
22 typedef struct Lump Lump;
23 typedef struct DBlock DBlock;
24 typedef struct Part Part;
25 typedef struct Statbin Statbin;
26 typedef struct Statdesc Statdesc;
27 typedef struct Stats Stats;
28 typedef struct ZBlock ZBlock;
29 typedef struct Round Round;
30 typedef struct Bloom Bloom;
/*
 * Plan 9 compiler directive: IEStream is deliberately left incomplete here.
 * NOTE(review): the struct body is presumably private to one source file -- confirm.
 */
32 #pragma incomplete IEStream
/*
 * Bitwise complement of zero for each integer type.
 * The outer cast converts the result of ~ back to the named type, which
 * matters whenever the operand is promoted to int before ~ is applied.
 * NOTE(review): u8int/u32int/u64int are declared outside this view;
 * presumably unsigned 8/32/64-bit types, making these all-ones values.
 */
34 #define TWID32 ((u32int)~(u32int)0)
35 #define TWID64 ((u64int)~(u64int)0)
36 #define TWID8 ((u8int)~(u8int)0)
41 * formerly fundamental constant,
42 * now a server-imposed limitation.
44 VtMaxLumpSize = 56*1024,
46 ABlockLog = 9, /* log2(512), the quantum for reading arenas */
48 MaxDiskBlock = 64*1024, /* max. allowed size for a disk block */
49 MaxIoSize = 64*1024, /* max. allowed size for a disk io operation */
50 PartBlank = 256*1024, /* untouched section at beginning of partition */
51 HeadSize = 512, /* size of a header after PartBlank */
52 MinArenaSize = 1*1024*1024, /* smallest reasonable arena size */
53 IndexBase = 1024*1024, /* initial address to use in an index */
54 MaxIo = 64*1024, /* max size of a single read or write operation */
55 ICacheBits = 16, /* default bits for indexing icache */
56 MaxAMap = 31*1024, /* max. allowed arenas in an address mapping; must be < 32*1024 */
60 * return codes from syncarena
62 SyncDataErr = 1 << 0, /* problem reading the clump data */
63 SyncCIErr = 1 << 1, /* found erroneous clump directory entries */
64 SyncCIZero = 1 << 2, /* found unwritten clump directory entries */
65 SyncFixErr = 1 << 3, /* error writing fixed data */
66 SyncHeader = 1 << 4, /* altered header fields */
71 EOk = 0, /* error expected in normal operation */
72 EStrange, /* strange error that should be logged */
73 ECorrupt, /* corrupted data found in arenas */
74 EICorrupt, /* corrupted data found in index */
75 EAdmin, /* should be brought to administrators' attention */
76 ECrash, /* really bad internal error */
77 EBug, /* a limitation which should be fixed */
78 EInconsist, /* inconsistencies between index and arena */
82 * internal disk formats for the venti archival storage system
85 * magic numbers on disk
87 _ClumpMagic = 0xd15cb10cU, /* clump header, deprecated */
88 ClumpFreeMagic = 0, /* free clump; terminates active clump log */
90 ArenaPartMagic = 0xa9e4a5e7U, /* arena partition header */
91 ArenaMagic = 0xf2a14eadU, /* arena trailer */
92 ArenaHeadMagic = 0xd15c4eadU, /* arena header */
94 BloomMagic = 0xb1004eadU, /* bloom filter header */
97 ISectMagic = 0xd15c5ec7U, /* index header */
108 * encodings of clumps on disk
110 ClumpEErr = 0, /* can't happen */
111 ClumpENone, /* plain */
112 ClumpECompress, /* compressed */
116 * sizes in bytes on disk
123 ArenaPartSize = 4 * U32Size,
124 ArenaSize4 = 2 * U64Size + 6 * U32Size + ANameSize + U8Size,
125 ArenaSize5 = ArenaSize4 + U32Size,
126 ArenaSize5a = ArenaSize5 + 2 * U8Size + 2 * U32Size + 2 * U64Size,
127 ArenaHeadSize4 = U64Size + 3 * U32Size + ANameSize,
128 ArenaHeadSize5 = ArenaHeadSize4 + U32Size,
129 BloomHeadSize = 4 * U32Size,
130 ISectSize1 = 7 * U32Size + 2 * ANameSize,
131 ISectSize2 = ISectSize1 + U32Size,
132 ClumpInfoSize = U8Size + 2 * U16Size + VtScoreSize,
133 ClumpSize = ClumpInfoSize + U8Size + 3 * U32Size,
134 MaxBloomSize = 1<<(32-3), /* 2^32 bits */
135 MaxBloomHash = 32, /* bits per score */
137 * BUG - The various block copies that manipulate entry buckets
138 * would be faster if we bumped IBucketSize up to 8 and IEntrySize up to 40,
139 * so that everything is word-aligned. Buildindex is actually cpu-bound
140 * by the (byte at a time) copying in qsort.
142 IBucketSize = U32Size + U16Size,
143 IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize,
144 IEntryTypeOff = VtScoreSize + U32Size + U16Size + U64Size + U16Size,
145 IEntryAddrOff = VtScoreSize + U32Size + U16Size,
147 MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,
149 IcacheFrac = 1000000, /* denominator */
151 SleepForever = 1000000000, /* magic value for sleep time */
153 * dirty flags - order controls disk write order
160 ArenaCIGSize = 10*1024, // about 0.5 MB worth of IEntry.
/*
 * Trace* character arrays; only declared here, defined in another file.
 * NOTE(review): presumably each holds the name of a trace category -- confirm
 * against the definitions.
 */
165 extern char TraceDisk[];
166 extern char TraceLump[];
167 extern char TraceBlock[];
168 extern char TraceProc[];
169 extern char TraceWork[];
170 extern char TraceQuiet[];
171 extern char TraceRpc[];
174 * results of parsing and initializing a config file
178 char *index; /* name of the index to initialize */
179 int naparts; /* arena partitions initialized */
181 int nsects; /* index sections initialized */
183 Bloom *bloom; /* bloom filter */
194 * a Part is the low level interface to files or disks.
195 * there are two main types of partitions
196 * arena partitions, which hold some number of arenas, each in a sub-partition.
197 * index partitions, which only have one subpartition.
201 int fd; /* rock for accessing the disk */
204 u64int size; /* size of the partition */
205 u32int blocksize; /* block size for reads and writes */
206 u32int fsblocksize; /* minimum file system block size */
209 Channel *writechan; /* chan[dcache.nblock](DBlock*) */
213 * a cached block from the partition
214 * yuck -- most of this is internal structure for the cache
215 * all other routines should only use data
221 Part *part; /* partition in which cached */
222 u64int addr; /* base address on the partition */
223 u32int size; /* amount of data available, not amount allocated; should go away */
227 DBlock *next; /* doubly linked hash chains */
229 u32int heap; /* index in heap table */
230 u32int used; /* last reference times */
232 u32int ref; /* reference count */
233 RWLock lock; /* for access to data only */
234 Channel *writedonechan;
235 void* chanbuf[1]; /* buffer for the chan! */
239 * a cached block from the partition
240 * yuck -- most of this is internal structure for the cache
241 * all other routines should only use data
242 * double yuck -- this is mostly the same as a DBlock
248 Part *part; /* partition in which cached */
249 u8int score[VtScoreSize]; /* score of packet */
250 u8int type; /* type of packet */
251 u32int size; /* amount of data allocated to hold packet */
252 Lump *next; /* doubly linked hash chains */
254 u32int heap; /* index in heap table */
255 u32int used; /* last reference times */
257 u32int ref; /* reference count */
258 QLock lock; /* for access to data only */
262 * mapping between names and address ranges
268 char name[ANameSize];
272 * an AMap along with a length
281 * an ArenaPart is a partition made up of Arenas
282 * it exists because most os's don't support many partitions,
283 * and we want to have many different Arenas
288 u64int size; /* size of underlying partition, rounded down to blocks */
290 u32int tabbase; /* base address of arena table on disk */
291 u32int tabsize; /* max. bytes in arena table */
294 * fields stored on disk
297 u32int blocksize; /* "optimal" block size for reads and writes */
298 u32int arenabase; /* base address of first arena */
301 * stored in the arena mapping table on disk
308 * info about one block in the clump info cache
312 u32int block; /* blocks in the directory */
313 int offset; /* offsets of one clump in the data */
318 * Statistics kept in the tail.
322 u32int clumps; /* number of clumps */
323 u32int cclumps; /* number of compressed clumps */
330 * Arena state - represents a point in the data log
335 u64int aa; /* index address */
340 * an Arena is a log of Clumps, preceded by an ArenaHeader,
341 * and followed by an Arena, each in one disk block.
342 * struct on disk is not always up to date, but should be self-consistent.
343 * to sync after reboot, follow clumps starting at used until ClumpFreeMagic is found.
344 * <struct name="Arena" type="Arena *">
345 * <field name="name" val="s->name" type="AName"/>
346 * <field name="version" val="s->version" type="U32int"/>
347 * <field name="partition" val="s->part->name" type="AName"/>
348 * <field name="blocksize" val="s->blocksize" type="U32int"/>
349 * <field name="start" val="s->base" type="U64int"/>
350 * <field name="stop" val="s->base+2*s->blocksize" type="U64int"/>
351 * <field name="created" val="s->ctime" type="U32int"/>
352 * <field name="modified" val="s->wtime" type="U32int"/>
353 * <field name="sealed" val="s->sealed" type="Sealed"/>
354 * <field name="score" val="s->score" type="Score"/>
355 * <field name="clumps" val="s->clumps" type="U32int"/>
356 * <field name="compressedclumps" val="s->cclumps" type="U32int"/>
357 * <field name="data" val="s->uncsize" type="U64int"/>
358 * <field name="compresseddata" val="s->used - s->clumps * ClumpSize" type="U64int"/>
359 * <field name="storage" val="s->used + s->clumps * ClumpInfoSize" type="U64int"/>
364 QLock lock; /* lock for arena fields, writing to disk */
365 Part *part; /* partition in which arena lives */
366 int blocksize; /* size of block to read or write */
367 u64int base; /* base address on disk */
368 u64int size; /* total space in the arena */
369 u8int score[VtScoreSize]; /* score of the entire sealed & summed arena */
371 int clumpmax; /* ClumpInfos per block */
376 * fields stored on disk
379 char name[ANameSize]; /* text label */
381 ATailStats diskstats;
382 u32int ctime; /* first time a block was written */
383 u32int wtime; /* last time a block was written */
392 u64int offset; // from arena base
396 * redundant storage of some fields at the beginning of each arena
401 char name[ANameSize];
408 * most interesting meta information for a clump.
409 * stored in each clump's header and in the Arena's directory,
410 * stored in reverse order just prior to the arena trailer
415 u16int size; /* size of disk data, not including header */
416 u16int uncsize; /* size of uncompressed data */
417 u8int score[VtScoreSize]; /* score of the uncompressed data only */
421 * header for an immutable clump of data
427 u32int creator; /* initial client which wrote the block */
428 u32int time; /* creation at gmt seconds since 1/1/1970 */
432 * index of all clumps according to their score
433 * this is just a wrapper to tie together the index sections
434 * <struct name="Index" type="Index *">
435 * <field name="name" val="s->name" type="AName"/>
436 * <field name="version" val="s->version" type="U32int"/>
437 * <field name="blocksize" val="s->blocksize" type="U32int"/>
438 * <field name="tabsize" val="s->tabsize" type="U32int"/>
439 * <field name="buckets" val="s->buckets" type="U32int"/>
440 * <field name="buckdiv" val="s->div" type="U32int"/>
441 * <field name="bitblocks" val="s->div" type="U32int"/>
442 * <field name="maxdepth" val="s->div" type="U32int"/>
443 * <field name="bitkeylog" val="s->div" type="U32int"/>
444 * <field name="bitkeymask" val="s->div" type="U32int"/>
445 * <array name="sect" val="&s->smap[i]" elems="s->nsects" type="Amap"/>
446 * <array name="amap" val="&s->amap[i]" elems="s->narenas" type="Amap"/>
447 * <array name="arena" val="s->arenas[i]" elems="s->narenas" type="Arena"/>
449 * <struct name="Amap" type="AMap *">
450 * <field name="name" val="s->name" type="AName"/>
451 * <field name="start" val="s->start" type="U64int"/>
452 * <field name="stop" val="s->stop" type="U64int"/>
457 u32int div; /* divisor for mapping score to bucket */
458 u32int buckets; /* last bucket used in disk hash table */
460 u32int tabsize; /* max. bytes in index config */
462 int mapalloc; /* first arena to check when adding a lump */
463 Arena **arenas; /* arenas in the mapping */
464 ISect **sects; /* sections which hold the buckets */
465 Bloom *bloom; /* bloom filter */
468 * fields stored in config file
471 char name[ANameSize]; /* text label */
473 AMap *smap; /* mapping of buckets to index sections */
475 AMap *amap; /* mapping from index addresses to arenas */
481 * one part of the bucket storage for an index.
482 * the index blocks are sequentially allocated
483 * across all of the sections.
488 int blocklog; /* log2(blocksize) */
489 int buckmax; /* max. entries in an index bucket */
490 u32int tabbase; /* base address of index config table on disk */
491 u32int tabsize; /* max. bytes in index config */
493 Channel *writedonechan;
494 void *ig; /* used by buildindex only */
498 * fields stored on disk
502 char name[ANameSize]; /* text label */
503 char index[ANameSize]; /* index owning the section */
504 u32int blocksize; /* size of hash buckets in index */
505 u32int blockbase; /* address of start of on disk index table */
506 u32int blocks; /* total blocks on disk; some may be unused */
507 u32int start; /* first bucket in this section */
508 u32int stop; /* limit of buckets in this section */
512 * externally interesting part of an IEntry
517 u16int size; /* uncompressed size */
518 u8int type; /* type of block */
519 u8int blocks; /* arena io quanta for Clump + data */
523 * entries in the index
524 * kept in IBuckets in the disk index table,
525 * cached in the memory ICache.
529 /* on disk data - 32 bytes*/
530 u8int score[VtScoreSize];
546 * buckets in the on disk index table
550 u16int n; /* number of active indices */
551 u32int buck; /* used by buildindex/checkindex only */
556 * temporary buffers used by individual threads
567 * simple input buffer for a '\0' terminated text file
571 char *name; /* name of the file */
572 ZBlock *b; /* entire contents of file */
573 u32int pos; /* current position in the file */
582 /* keep in sync with stats.c:/statdesc and httpd.c:/graphname*/
592 StatRpcReadCachedTime,
594 StatRpcReadUncachedTime,
621 StatDcacheLookupTime,
/*
 * Table of NStat Statdesc entries; only declared here.
 * NOTE(review): presumably one descriptor per Stat* enum value, defined in stats.c -- confirm.
 */
667 extern Statdesc statdesc[NStat];
670 * statistics about the operation of the server
671 * mainly for performance monitoring and profiling.
689 long (*fn)(Stats*, Stats*, void*);
701 * for kicking background processes that run one round after another after another
719 * Bloom filter of stored block hashes
723 RWLock lk; /* protects nhash, nbits, tab, mb */
724 QLock mod; /* one marker at a time, protects nb */
726 ulong size; /* bytes in tab */
727 ulong bitmask; /* to produce bit index */
731 Channel *writedonechan;
/*
 * Globals shared across the program.  They are only declared here;
 * the definitions live in other source files.
 */
734 extern Index *mainindex;
735 extern u32int maxblocksize; /* max. block size used by any partition */
736 extern int paranoid; /* should verify hashes on disk read */
737 extern int queuewrites; /* put all lump writes on a queue and finish later */
738 extern int readonly; /* only allowed to read the disk data */
740 extern u8int zeroscore[VtScoreSize];
741 extern int compressblocks;
742 extern int writestodevnull; /* dangerous - for performance debugging */
743 extern int bootstrap; /* writes but does not index - cannot read */
744 extern int collectstats;
745 extern QLock memdrawlock;
746 extern int icachesleeptime;
747 extern int minicachesleeptime;
748 extern int arenasumsleeptime;
749 extern int manualscheduling;
750 extern int l0quantum;
751 extern int l1quantum;
752 extern int ignorebloom;
753 extern int icacheprefetch;
754 extern int syncwrites;
755 extern int debugarena; /* print in arena error msgs; -1==unknown */
757 extern Stats *stathist;
758 extern int nstathist;
759 extern ulong stattime;
/*
 * Plan 9 compiler directive: lets the compiler check that arguments passed
 * for the "V" print verb have type uchar*.
 * NOTE(review): "V" is presumably the score-formatting verb -- confirm where it is installed.
 */
762 #pragma varargck type "V" uchar*