Blob


/*
 * forward typedefs, so the declarations that follow can refer
 * to each structure by its bare name.
 */
typedef struct Config		Config;
typedef struct AMap		AMap;
typedef struct AMapN		AMapN;
typedef struct Arena		Arena;
typedef struct AState		AState;
typedef struct ArenaCIG		ArenaCIG;
typedef struct ArenaHead	ArenaHead;
typedef struct ArenaPart	ArenaPart;
typedef struct ArenaTail	ArenaTail;
typedef struct ATailStats	ATailStats;
typedef struct CIBlock		CIBlock;
typedef struct Clump		Clump;
typedef struct ClumpInfo	ClumpInfo;
typedef struct Graph		Graph;
typedef struct IAddr		IAddr;
typedef struct IBucket		IBucket;
typedef struct IEStream		IEStream;
typedef struct IEntry		IEntry;
typedef struct IFile		IFile;
typedef struct ISect		ISect;
typedef struct Index		Index;
typedef struct Lump		Lump;
typedef struct DBlock		DBlock;
typedef struct Part		Part;
typedef struct Statbin		Statbin;
typedef struct Statdesc		Statdesc;
typedef struct Stats		Stats;
typedef struct ZBlock		ZBlock;
typedef struct Round		Round;
typedef struct Bloom		Bloom;

/*
 * Plan 9 compiler directive: IEStream is deliberately left
 * incomplete in this header.
 */
#pragma incomplete IEStream
/*
 * all-ones values for each unsigned width
 */
#define	TWID32	((u32int)~(u32int)0)
#define	TWID64	((u64int)~(u64int)0)
#define	TWID8	((u8int)~(u8int)0)
38 enum
39 {
40 /*
41 * formerly fundamental constant,
42 * now a server-imposed limitation.
43 */
44 VtMaxLumpSize = 56*1024,
46 ABlockLog = 9, /* log2(512), the quantum for reading arenas */
47 ANameSize = 64,
48 MaxDiskBlock = 64*1024, /* max. allowed size for a disk block */
49 MaxIoSize = 64*1024, /* max. allowed size for a disk io operation */
50 PartBlank = 256*1024, /* untouched section at beginning of partition */
51 HeadSize = 512, /* size of a header after PartBlank */
52 MinArenaSize = 1*1024*1024, /* smallest reasonable arena size */
53 IndexBase = 1024*1024, /* initial address to use in an index */
54 MaxIo = 64*1024, /* max size of a single read or write operation */
55 ICacheBits = 16, /* default bits for indexing icache */
56 MaxAMap = 31*1024, /* max. allowed arenas in an address mapping; must be < 32*1024 */
58 /*
59 * return codes from syncarena
60 */
61 SyncDataErr = 1 << 0, /* problem reading the clump data */
62 SyncCIErr = 1 << 1, /* found erroneous clump directory entries */
63 SyncCIZero = 1 << 2, /* found unwritten clump directory entries */
64 SyncFixErr = 1 << 3, /* error writing fixed data */
65 SyncHeader = 1 << 4, /* altered header fields */
67 /*
68 * error severity
69 */
70 EOk = 0, /* error expected in normal operation */
71 EStrange, /* strange error that should be logged */
72 ECorrupt, /* corrupted data found in arenas */
73 EICorrupt, /* corrupted data found in index */
74 EAdmin, /* should be brought to administrators' attention */
75 ECrash, /* really bad internal error */
76 EBug, /* a limitation which should be fixed */
77 EInconsist, /* inconsistencies between index and arena */
78 EMax,
80 /*
81 * internal disk formats for the venti archival storage system
82 */
83 /*
84 * magic numbers on disk
85 */
86 _ClumpMagic = 0xd15cb10cU, /* clump header, deprecated */
87 ClumpFreeMagic = 0, /* free clump; terminates active clump log */
89 ArenaPartMagic = 0xa9e4a5e7U, /* arena partition header */
90 ArenaMagic = 0xf2a14eadU, /* arena trailer */
91 ArenaHeadMagic = 0xd15c4eadU, /* arena header */
93 BloomMagic = 0xb1004eadU, /* bloom filter header */
94 BloomMaxHash = 32,
96 ISectMagic = 0xd15c5ec7U, /* index header */
98 ArenaPartVersion = 3,
99 ArenaVersion4 = 4,
100 ArenaVersion5 = 5,
101 BloomVersion = 1,
102 IndexVersion = 1,
103 ISectVersion1 = 1,
104 ISectVersion2 = 2,
106 /*
107 * encodings of clumps on disk
108 */
109 ClumpEErr = 0, /* can't happen */
110 ClumpENone, /* plain */
111 ClumpECompress, /* compressed */
112 ClumpEMax,
114 /*
115 * sizes in bytes on disk
116 */
117 U8Size = 1,
118 U16Size = 2,
119 U32Size = 4,
120 U64Size = 8,
122 ArenaPartSize = 4 * U32Size,
123 ArenaSize4 = 2 * U64Size + 6 * U32Size + ANameSize + U8Size,
124 ArenaSize5 = ArenaSize4 + U32Size,
125 ArenaSize5a = ArenaSize5 + 2 * U8Size + 2 * U32Size + 2 * U64Size,
126 ArenaHeadSize4 = U64Size + 3 * U32Size + ANameSize,
127 ArenaHeadSize5 = ArenaHeadSize4 + U32Size,
128 BloomHeadSize = 4 * U32Size,
129 ISectSize1 = 7 * U32Size + 2 * ANameSize,
130 ISectSize2 = ISectSize1 + U32Size,
131 ClumpInfoSize = U8Size + 2 * U16Size + VtScoreSize,
132 ClumpSize = ClumpInfoSize + U8Size + 3 * U32Size,
133 MaxBloomSize = 1<<(32-3), /* 2^32 bits */
134 MaxBloomHash = 32, /* bits per score */
135 /*
136 * BUG - The various block copies that manipulate entry buckets
137 * would be faster if we bumped IBucketSize up to 8 and IEntrySize up to 40,
138 * so that everything is word-aligned. Buildindex is actually cpu-bound
139 * by the (byte at a time) copying in qsort.
140 */
141 IBucketSize = U32Size + U16Size,
142 IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize,
143 IEntryTypeOff = VtScoreSize + U32Size + U16Size + U64Size + U16Size,
144 IEntryAddrOff = VtScoreSize + U32Size + U16Size,
146 MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,
148 IcacheFrac = 1000000, /* denominator */
150 SleepForever = 1000000000, /* magic value for sleep time */
151 /*
152 * dirty flags - order controls disk write order
153 */
154 DirtyArena = 1,
155 DirtyArenaCib,
156 DirtyArenaTrailer,
157 DirtyMax,
159 ArenaCIGSize = 10*1024, // about 0.5 MB worth of IEntry.
161 VentiZZZZZZZZ
162 };
/*
 * trace strings, defined elsewhere
 */
extern char	TraceDisk[];
extern char	TraceLump[];
extern char	TraceBlock[];
extern char	TraceProc[];
extern char	TraceWork[];
extern char	TraceQuiet[];
extern char	TraceRpc[];
172 /*
173 * results of parsing and initializing a config file
174 */
175 struct Config
177 char *index; /* name of the index to initialize */
178 int naparts; /* arena partitions initialized */
179 ArenaPart **aparts;
180 int nsects; /* index sections initialized */
181 ISect **sects;
182 Bloom *bloom; /* bloom filter */
183 u32int bcmem;
184 u32int mem;
185 u32int icmem;
186 int queuewrites;
187 char* haddr;
188 char* vaddr;
189 char* webroot;
190 };
192 /*
193 * a Part is the low level interface to files or disks.
194 * there are two main types of partitions
195 * arena paritions, which some number of arenas, each in a sub-partition.
196 * index partition, which only have one subpartition.
197 */
198 struct Part
200 int fd; /* rock for accessing the disk */
201 int mode;
202 u64int offset;
203 u64int size; /* size of the partiton */
204 u32int blocksize; /* block size for reads and writes */
205 u32int fsblocksize; /* minimum file system block size */
206 char *name;
207 char *filename;
208 Channel *writechan; /* chan[dcache.nblock](DBlock*) */
209 };
211 /*
212 * a cached block from the partition
213 * yuck -- most of this is internal structure for the cache
214 * all other routines should only use data
215 */
216 struct DBlock
218 u8int *data;
220 Part *part; /* partition in which cached */
221 u64int addr; /* base address on the partition */
222 u32int size; /* amount of data available, not amount allocated; should go away */
223 u32int mode;
224 u32int dirty;
225 u32int dirtying;
226 DBlock *next; /* doubly linked hash chains */
227 DBlock *prev;
228 u32int heap; /* index in heap table */
229 u32int used; /* last reference times */
230 u32int used2;
231 u32int ref; /* reference count */
232 RWLock lock; /* for access to data only */
233 Channel *writedonechan;
234 void* chanbuf[1]; /* buffer for the chan! */
235 };
237 /*
238 * a cached block from the partition
239 * yuck -- most of this is internal structure for the cache
240 * all other routines should only use data
241 * double yuck -- this is mostly the same as a DBlock
242 */
243 struct Lump
245 Packet *data;
247 Part *part; /* partition in which cached */
248 u8int score[VtScoreSize]; /* score of packet */
249 u8int type; /* type of packet */
250 u32int size; /* amount of data allocated to hold packet */
251 Lump *next; /* doubly linked hash chains */
252 Lump *prev;
253 u32int heap; /* index in heap table */
254 u32int used; /* last reference times */
255 u32int used2;
256 u32int ref; /* reference count */
257 QLock lock; /* for access to data only */
258 };
260 /*
261 * mapping between names and address ranges
262 */
263 struct AMap
265 u64int start;
266 u64int stop;
267 char name[ANameSize];
268 };
270 /*
271 * an AMap along with a length
272 */
273 struct AMapN
275 int n;
276 AMap *map;
277 };
279 /*
280 * an ArenaPart is a partition made up of Arenas
281 * it exists because most os's don't support many partitions,
282 * and we want to have many different Arenas
283 */
284 struct ArenaPart
286 Part *part;
287 u64int size; /* size of underlying partition, rounded down to blocks */
288 Arena **arenas;
289 u32int tabbase; /* base address of arena table on disk */
290 u32int tabsize; /* max. bytes in arena table */
292 /*
293 * fields stored on disk
294 */
295 u32int version;
296 u32int blocksize; /* "optimal" block size for reads and writes */
297 u32int arenabase; /* base address of first arena */
299 /*
300 * stored in the arena mapping table on disk
301 */
302 AMap *map;
303 int narenas;
304 };
306 /*
307 * info about one block in the clump info cache
308 */
309 struct CIBlock
311 u32int block; /* blocks in the directory */
312 int offset; /* offsets of one clump in the data */
313 DBlock *data;
314 };
316 /*
317 * Statistics kept in the tail.
318 */
319 struct ATailStats
321 u32int clumps; /* number of clumps */
322 u32int cclumps; /* number of compressed clumps */
323 u64int used;
324 u64int uncsize;
325 u8int sealed;
326 };
328 /*
329 * Arena state - represents a point in the data log
330 */
331 struct AState
333 Arena *arena;
334 u64int aa; /* index address */
335 ATailStats stats;
336 };
338 /*
339 * an Arena is a log of Clumps, preceeded by an ArenaHeader,
340 * and followed by a Arena, each in one disk block.
341 * struct on disk is not always up to date, but should be self-consistent.
342 * to sync after reboot, follow clumps starting at used until ClumpFreeMagic if found.
343 * <struct name="Arena" type="Arena *">
344 * <field name="name" val="s->name" type="AName"/>
345 * <field name="version" val="s->version" type="U32int"/>
346 * <field name="partition" val="s->part->name" type="AName"/>
347 * <field name="blocksize" val="s->blocksize" type="U32int"/>
348 * <field name="start" val="s->base" type="U64int"/>
349 * <field name="stop" val="s->base+2*s->blocksize" type="U64int"/>
350 * <field name="created" val="s->ctime" type="U32int"/>
351 * <field name="modified" val="s->wtime" type="U32int"/>
352 * <field name="sealed" val="s->sealed" type="Sealed"/>
353 * <field name="score" val="s->score" type="Score"/>
354 * <field name="clumps" val="s->clumps" type="U32int"/>
355 * <field name="compressedclumps" val="s->cclumps" type="U32int"/>
356 * <field name="data" val="s->uncsize" type="U64int"/>
357 * <field name="compresseddata" val="s->used - s->clumps * ClumpSize" type="U64int"/>
358 * <field name="storage" val="s->used + s->clumps * ClumpInfoSize" type="U64int"/>
359 * </struct>
360 */
361 struct Arena
363 QLock lock; /* lock for arena fields, writing to disk */
364 Part *part; /* partition in which arena lives */
365 int blocksize; /* size of block to read or write */
366 u64int base; /* base address on disk */
367 u64int size; /* total space in the arena */
368 u8int score[VtScoreSize]; /* score of the entire sealed & summed arena */
370 int clumpmax; /* ClumpInfos per block */
371 AState mem;
372 int inqueue;
374 /*
375 * fields stored on disk
376 */
377 u32int version;
378 char name[ANameSize]; /* text label */
379 ATailStats memstats;
380 ATailStats diskstats;
381 u32int ctime; /* first time a block was written */
382 u32int wtime; /* last time a block was written */
383 u32int clumpmagic;
385 ArenaCIG *cig;
386 int ncig;
387 };
389 struct ArenaCIG
391 u64int offset; // from arena base
392 };
394 /*
395 * redundant storage of some fields at the beginning of each arena
396 */
397 struct ArenaHead
399 u32int version;
400 char name[ANameSize];
401 u32int blocksize;
402 u64int size;
403 u32int clumpmagic;
404 };
406 /*
407 * most interesting meta information for a clump.
408 * stored in each clump's header and in the Arena's directory,
409 * stored in reverse order just prior to the arena trailer
410 */
411 struct ClumpInfo
413 u8int type;
414 u16int size; /* size of disk data, not including header */
415 u16int uncsize; /* size of uncompressed data */
416 u8int score[VtScoreSize]; /* score of the uncompressed data only */
417 };
419 /*
420 * header for an immutable clump of data
421 */
422 struct Clump
424 ClumpInfo info;
425 u8int encoding;
426 u32int creator; /* initial client which wrote the block */
427 u32int time; /* creation at gmt seconds since 1/1/1970 */
428 };
430 /*
431 * index of all clumps according to their score
432 * this is just a wrapper to tie together the index sections
433 * <struct name="Index" type="Index *">
434 * <field name="name" val="s->name" type="AName"/>
435 * <field name="version" val="s->version" type="U32int"/>
436 * <field name="blocksize" val="s->blocksize" type="U32int"/>
437 * <field name="tabsize" val="s->tabsize" type="U32int"/>
438 * <field name="buckets" val="s->buckets" type="U32int"/>
439 * <field name="buckdiv" val="s->div" type="U32int"/>
440 * <field name="bitblocks" val="s->div" type="U32int"/>
441 * <field name="maxdepth" val="s->div" type="U32int"/>
442 * <field name="bitkeylog" val="s->div" type="U32int"/>
443 * <field name="bitkeymask" val="s->div" type="U32int"/>
444 * <array name="sect" val="&s->smap[i]" elems="s->nsects" type="Amap"/>
445 * <array name="amap" val="&s->amap[i]" elems="s->narenas" type="Amap"/>
446 * <array name="arena" val="s->arenas[i]" elems="s->narenas" type="Arena"/>
447 * </struct>
448 * <struct name="Amap" type="AMap *">
449 * <field name="name" val="s->name" type="AName"/>
450 * <field name="start" val="s->start" type="U64int"/>
451 * <field name="stop" val="s->stop" type="U64int"/>
452 * </struct>
453 */
454 struct Index
456 u32int div; /* divisor for mapping score to bucket */
457 u32int buckets; /* last bucket used in disk hash table */
458 u32int blocksize;
459 u32int tabsize; /* max. bytes in index config */
461 int mapalloc; /* first arena to check when adding a lump */
462 Arena **arenas; /* arenas in the mapping */
463 ISect **sects; /* sections which hold the buckets */
464 Bloom *bloom; /* bloom filter */
466 /*
467 * fields stored in config file
468 */
469 u32int version;
470 char name[ANameSize]; /* text label */
471 int nsects;
472 AMap *smap; /* mapping of buckets to index sections */
473 int narenas;
474 AMap *amap; /* mapping from index addesses to arenas */
476 QLock writing;
477 };
479 /*
480 * one part of the bucket storage for an index.
481 * the index blocks are sequentially allocated
482 * across all of the sections.
483 */
484 struct ISect
486 Part *part;
487 int blocklog; /* log2(blocksize) */
488 int buckmax; /* max. entries in a index bucket */
489 u32int tabbase; /* base address of index config table on disk */
490 u32int tabsize; /* max. bytes in index config */
491 Channel *writechan;
492 Channel *writedonechan;
493 void *ig; /* used by buildindex only */
494 int ng;
496 /*
497 * fields stored on disk
498 */
499 u32int version;
500 u32int bucketmagic;
501 char name[ANameSize]; /* text label */
502 char index[ANameSize]; /* index owning the section */
503 u32int blocksize; /* size of hash buckets in index */
504 u32int blockbase; /* address of start of on disk index table */
505 u32int blocks; /* total blocks on disk; some may be unused */
506 u32int start; /* first bucket in this section */
507 u32int stop; /* limit of buckets in this section */
508 };
510 /*
511 * externally interesting part of an IEntry
512 */
513 struct IAddr
515 u64int addr;
516 u16int size; /* uncompressed size */
517 u8int type; /* type of block */
518 u8int blocks; /* arena io quanta for Clump + data */
519 };
521 /*
522 * entries in the index
523 * kept in IBuckets in the disk index table,
524 * cached in the memory ICache.
525 */
526 struct IEntry
528 /* on disk data - 32 bytes*/
529 u8int score[VtScoreSize];
530 IAddr ia;
532 IEntry *nexthash;
533 IEntry *nextdirty;
534 IEntry *next;
535 IEntry *prev;
536 u8int state;
537 };
/*
 * IE* state values; declared right after struct IEntry,
 * presumably for its state field - confirm against users.
 */
enum {
	IEClean		= 0,
	IEDirty		= 1,
	IESummary	= 2,
};
544 /*
545 * buckets in the on disk index table
546 */
547 struct IBucket
549 u16int n; /* number of active indices */
550 u32int buck; /* used by buildindex/checkindex only */
551 u8int *data;
552 };
554 /*
555 * temporary buffers used by individual threads
556 */
557 struct ZBlock
559 u32int len;
560 u32int _size;
561 u8int *data;
562 u8int *free;
563 };
565 /*
566 * simple input buffer for a '\0' terminated text file
567 */
568 struct IFile
570 char *name; /* name of the file */
571 ZBlock *b; /* entire contents of file */
572 u32int pos; /* current position in the file */
573 };
575 struct Statdesc
577 char *name;
578 ulong max;
579 };
/*
 * indices of the individual statistics counters; NStat is the count.
 * keep in sync with stats.c:/statdesc and httpd.c:/graphname
 */
enum
{
	StatRpcTotal,
	StatRpcRead,
	StatRpcReadOk,
	StatRpcReadFail,
	StatRpcReadBytes,
	StatRpcReadTime,
	StatRpcReadCached,
	StatRpcReadCachedTime,
	StatRpcReadUncached,
	StatRpcReadUncachedTime,
	StatRpcWrite,
	StatRpcWriteNew,
	StatRpcWriteOld,
	StatRpcWriteFail,
	StatRpcWriteBytes,
	StatRpcWriteTime,
	StatRpcWriteNewTime,
	StatRpcWriteOldTime,

	StatLcacheHit,
	StatLcacheMiss,
	StatLcacheRead,
	StatLcacheWrite,
	StatLcacheSize,
	StatLcacheStall,
	StatLcacheReadTime,

	StatDcacheHit,
	StatDcacheMiss,
	StatDcacheLookup,
	StatDcacheRead,
	StatDcacheWrite,
	StatDcacheDirty,
	StatDcacheSize,
	StatDcacheFlush,
	StatDcacheStall,
	StatDcacheLookupTime,

	StatDblockStall,
	StatLumpStall,

	StatIcacheHit,
	StatIcacheMiss,
	StatIcacheRead,
	StatIcacheWrite,
	StatIcacheFill,
	StatIcachePrefetch,
	StatIcacheDirty,
	StatIcacheSize,
	StatIcacheFlush,
	StatIcacheStall,
	StatIcacheReadTime,
	StatIcacheLookup,
	StatScacheHit,
	StatScachePrefetch,

	StatBloomHit,
	StatBloomMiss,
	StatBloomFalseMiss,
	StatBloomLookup,
	StatBloomOnes,
	StatBloomBits,

	StatApartRead,
	StatApartReadBytes,
	StatApartWrite,
	StatApartWriteBytes,

	StatIsectRead,
	StatIsectReadBytes,
	StatIsectWrite,
	StatIsectWriteBytes,

	StatSumRead,
	StatSumReadBytes,

	StatCigLoad,
	StatCigLoadTime,

	NStat
};
666 extern Statdesc statdesc[NStat];
668 /*
669 * statistics about the operation of the server
670 * mainly for performance monitoring and profiling.
671 */
672 struct Stats
674 ulong now;
675 ulong n[NStat];
676 };
678 struct Statbin
680 uint nsamp;
681 uint min;
682 uint max;
683 uint avg;
684 };
686 struct Graph
688 long (*fn)(Stats*, Stats*, void*);
689 void *arg;
690 long t0;
691 long t1;
692 long min;
693 long max;
694 long wid;
695 long ht;
696 int fill;
697 };
699 /*
700 * for kicking background processes that run one round after another after another
701 */
702 struct Round
704 QLock lock;
705 Rendez start;
706 Rendez finish;
707 Rendez delaywait;
708 int delaytime;
709 int delaykick;
710 char* name;
711 int last;
712 int current;
713 int next;
714 int doanother;
715 };
717 /*
718 * Bloom filter of stored block hashes
719 */
720 struct Bloom
722 RWLock lk; /* protects nhash, nbits, tab, mb */
723 QLock mod; /* one marker at a time, protects nb */
724 int nhash;
725 ulong size; /* bytes in tab */
726 ulong bitmask; /* to produce bit index */
727 u8int *data;
728 Part *part;
729 Channel *writechan;
730 Channel *writedonechan;
731 };
733 extern Index *mainindex;
734 extern u32int maxblocksize; /* max. block size used by any partition */
735 extern int paranoid; /* should verify hashes on disk read */
736 extern int queuewrites; /* put all lump writes on a queue and finish later */
737 extern int readonly; /* only allowed to read the disk data */
738 extern Stats stats;
739 extern u8int zeroscore[VtScoreSize];
740 extern int compressblocks;
741 extern int writestodevnull; /* dangerous - for performance debugging */
742 extern int collectstats;
743 extern QLock memdrawlock;
744 extern int icachesleeptime;
745 extern int minicachesleeptime;
746 extern int arenasumsleeptime;
747 extern int manualscheduling;
748 extern int l0quantum;
749 extern int l1quantum;
750 extern int ignorebloom;
751 extern int icacheprefetch;
752 extern int syncwrites;
754 extern Stats *stathist;
755 extern int nstathist;
756 extern ulong stattime;
/*
 * only when PLAN9PORT is not defined: tell the Plan 9 compiler
 * that the %V print verb takes a uchar*, and define ODIRECT as 0.
 */
#ifndef PLAN9PORT
#pragma varargck type "V" uchar*
#define ODIRECT 0
#endif