Blob


/*
 * forward typedefs: let the structures in this file refer to
 * one another by bare name, independent of definition order.
 */
typedef struct Config		Config;
typedef struct AMap		AMap;
typedef struct AMapN		AMapN;
typedef struct Arena		Arena;
typedef struct AState		AState;
typedef struct ArenaHead	ArenaHead;
typedef struct ArenaPart	ArenaPart;
typedef struct ArenaTail	ArenaTail;
typedef struct ATailStats	ATailStats;
typedef struct CIBlock		CIBlock;
typedef struct Clump		Clump;
typedef struct ClumpInfo	ClumpInfo;
typedef struct Graph		Graph;
typedef struct IAddr		IAddr;
typedef struct IBucket		IBucket;
typedef struct IEStream		IEStream;
typedef struct IEntry		IEntry;
typedef struct IFile		IFile;
typedef struct ISect		ISect;
typedef struct Index		Index;
typedef struct Lump		Lump;
typedef struct DBlock		DBlock;
typedef struct Part		Part;
typedef struct Statbin		Statbin;
typedef struct Statdesc		Statdesc;
typedef struct Stats		Stats;
typedef struct ZBlock		ZBlock;
typedef struct Round		Round;
typedef struct Bloom		Bloom;
/*
 * all-ones values (every bit set) for each unsigned width
 */
#define	TWID32	((u32int)~(u32int)0)
#define	TWID64	((u64int)~(u64int)0)
#define	TWID8	((u8int)~(u8int)0)
35 enum
36 {
37 ABlockLog = 9, /* log2(512), the quantum for reading arenas */
38 ANameSize = 64,
39 MaxDiskBlock = 64*1024, /* max. allowed size for a disk block */
40 MaxIoSize = 64*1024, /* max. allowed size for a disk io operation */
41 PartBlank = 256*1024, /* untouched section at beginning of partition */
42 HeadSize = 512, /* size of a header after PartBlank */
43 MinArenaSize = 1*1024*1024, /* smallest reasonable arena size */
44 IndexBase = 1024*1024, /* initial address to use in an index */
45 MaxIo = 64*1024, /* max size of a single read or write operation */
46 ICacheBits = 16, /* default bits for indexing icache */
47 ICacheDepth = 4, /* default depth of an icache hash chain */
48 MaxAMap = 2*1024, /* max. allowed arenas in an address mapping; must be < 32*1024 */
50 /*
51 * return codes from syncarena
52 */
53 SyncDataErr = 1 << 0, /* problem reading the clump data */
54 SyncCIErr = 1 << 1, /* found erroneous clump directory entries */
55 SyncCIZero = 1 << 2, /* found unwritten clump directory entries */
56 SyncFixErr = 1 << 3, /* error writing fixed data */
57 SyncHeader = 1 << 4, /* altered header fields */
59 /*
60 * error severity
61 */
62 EOk = 0, /* error expected in normal operation */
63 EStrange, /* strange error that should be logged */
64 ECorrupt, /* corrupted data found in arenas */
65 EICorrupt, /* corrupted data found in index */
66 EAdmin, /* should be brought to administrators' attention */
67 ECrash, /* really bad internal error */
68 EBug, /* a limitation which should be fixed */
69 EInconsist, /* inconsistencies between index and arena */
70 EMax,
72 /*
73 * internal disk formats for the venti archival storage system
74 */
75 /*
76 * magic numbers on disk
77 */
78 _ClumpMagic = 0xd15cb10cU, /* clump header, deprecated */
79 ClumpFreeMagic = 0, /* free clump; terminates active clump log */
81 ArenaPartMagic = 0xa9e4a5e7U, /* arena partition header */
82 ArenaMagic = 0xf2a14eadU, /* arena trailer */
83 ArenaHeadMagic = 0xd15c4eadU, /* arena header */
85 BloomMagic = 0xb1004eadU, /* bloom filter header */
86 BloomMaxHash = 32,
88 ISectMagic = 0xd15c5ec7U, /* index header */
90 ArenaPartVersion = 3,
91 ArenaVersion4 = 4,
92 ArenaVersion5 = 5,
93 BloomVersion = 1,
94 IndexVersion = 1,
95 ISectVersion1 = 1,
96 ISectVersion2 = 2,
98 /*
99 * encodings of clumps on disk
100 */
101 ClumpEErr = 0, /* can't happen */
102 ClumpENone, /* plain */
103 ClumpECompress, /* compressed */
104 ClumpEMax,
106 /*
107 * sizes in bytes on disk
108 */
109 U8Size = 1,
110 U16Size = 2,
111 U32Size = 4,
112 U64Size = 8,
114 ArenaPartSize = 4 * U32Size,
115 ArenaSize4 = 2 * U64Size + 6 * U32Size + ANameSize + U8Size,
116 ArenaSize5 = ArenaSize4 + U32Size,
117 ArenaHeadSize4 = U64Size + 3 * U32Size + ANameSize,
118 ArenaHeadSize5 = ArenaHeadSize4 + U32Size,
119 BloomHeadSize = 4 * U32Size,
120 ISectSize1 = 7 * U32Size + 2 * ANameSize,
121 ISectSize2 = ISectSize1 + U32Size,
122 ClumpInfoSize = U8Size + 2 * U16Size + VtScoreSize,
123 ClumpSize = ClumpInfoSize + U8Size + 3 * U32Size,
124 MaxBloomSize = 1<<(32-3), /* 2^32 bits */
125 MaxBloomHash = 32, /* bits per score */
126 /*
127 * BUG - The various block copies that manipulate entry buckets
128 * would be faster if we bumped IBucketSize up to 8 and IEntrySize up to 40,
129 * so that everything is word-aligned. Buildindex is actually cpu-bound
130 * by the (byte at a time) copying in qsort.
131 */
132 IBucketSize = U32Size + U16Size,
133 IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize,
134 IEntryTypeOff = VtScoreSize + U64Size + U32Size + 2 * U16Size,
136 MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,
138 /*
139 * dirty flags - order controls disk write order
140 */
141 DirtyArena = 1,
142 DirtyArenaCib,
143 DirtyArenaTrailer,
144 DirtyMax,
146 VentiZZZZZZZZ
147 };
/*
 * trace name strings, defined elsewhere.
 * NOTE(review): presumably the labels passed to the tracing
 * routines -- confirm against their definition.
 */
extern char	TraceDisk[];
extern char	TraceLump[];
extern char	TraceBlock[];
extern char	TraceProc[];
extern char	TraceWork[];
extern char	TraceQuiet[];
extern char	TraceRpc[];
157 /*
158 * results of parsing and initializing a config file
159 */
160 struct Config
162 char *index; /* name of the index to initialize */
163 int naparts; /* arena partitions initialized */
164 ArenaPart **aparts;
165 int nsects; /* index sections initialized */
166 ISect **sects;
167 Bloom *bloom; /* bloom filter */
168 u32int bcmem;
169 u32int mem;
170 u32int icmem;
171 int queuewrites;
172 char* haddr;
173 char* vaddr;
174 char* webroot;
175 };
177 /*
178 * a Part is the low level interface to files or disks.
179 * there are two main types of partitions
180 * arena paritions, which some number of arenas, each in a sub-partition.
181 * index partition, which only have one subpartition.
182 */
183 struct Part
185 int fd; /* rock for accessing the disk */
186 int mode;
187 u64int offset;
188 u64int size; /* size of the partiton */
189 u32int blocksize; /* block size for reads and writes */
190 u32int fsblocksize; /* minimum file system block size */
191 char *name;
192 char *filename;
193 Channel *writechan; /* chan[dcache.nblock](DBlock*) */
194 };
196 /*
197 * a cached block from the partition
198 * yuck -- most of this is internal structure for the cache
199 * all other routines should only use data
200 */
201 struct DBlock
203 u8int *data;
205 Part *part; /* partition in which cached */
206 u64int addr; /* base address on the partition */
207 u32int size; /* amount of data available, not amount allocated; should go away */
208 u32int mode;
209 u32int dirty;
210 u32int dirtying;
211 DBlock *next; /* doubly linked hash chains */
212 DBlock *prev;
213 u32int heap; /* index in heap table */
214 u32int used; /* last reference times */
215 u32int used2;
216 u32int ref; /* reference count */
217 RWLock lock; /* for access to data only */
218 Channel *writedonechan;
219 void* chanbuf[1]; /* buffer for the chan! */
220 };
222 /*
223 * a cached block from the partition
224 * yuck -- most of this is internal structure for the cache
225 * all other routines should only use data
226 * double yuck -- this is mostly the same as a DBlock
227 */
228 struct Lump
230 Packet *data;
232 Part *part; /* partition in which cached */
233 u8int score[VtScoreSize]; /* score of packet */
234 u8int type; /* type of packet */
235 u32int size; /* amount of data allocated to hold packet */
236 Lump *next; /* doubly linked hash chains */
237 Lump *prev;
238 u32int heap; /* index in heap table */
239 u32int used; /* last reference times */
240 u32int used2;
241 u32int ref; /* reference count */
242 QLock lock; /* for access to data only */
243 };
245 /*
246 * mapping between names and address ranges
247 */
248 struct AMap
250 u64int start;
251 u64int stop;
252 char name[ANameSize];
253 };
255 /*
256 * an AMap along with a length
257 */
258 struct AMapN
260 int n;
261 AMap *map;
262 };
264 /*
265 * an ArenaPart is a partition made up of Arenas
266 * it exists because most os's don't support many partitions,
267 * and we want to have many different Arenas
268 */
269 struct ArenaPart
271 Part *part;
272 u64int size; /* size of underlying partition, rounded down to blocks */
273 Arena **arenas;
274 u32int tabbase; /* base address of arena table on disk */
275 u32int tabsize; /* max. bytes in arena table */
277 /*
278 * fields stored on disk
279 */
280 u32int version;
281 u32int blocksize; /* "optimal" block size for reads and writes */
282 u32int arenabase; /* base address of first arena */
284 /*
285 * stored in the arena mapping table on disk
286 */
287 AMap *map;
288 int narenas;
289 };
291 /*
292 * info about one block in the clump info cache
293 */
294 struct CIBlock
296 u32int block; /* blocks in the directory */
297 int offset; /* offsets of one clump in the data */
298 DBlock *data;
299 };
301 /*
302 * Statistics kept in the tail.
303 */
304 struct ATailStats
306 u32int clumps; /* number of clumps */
307 u32int cclumps; /* number of compressed clumps */
308 u64int used;
309 u64int uncsize;
310 u8int sealed;
311 };
313 /*
314 * Arena state - represents a point in the data log
315 */
316 struct AState
318 Arena *arena;
319 u64int aa; /* index address */
320 ATailStats stats;
321 };
323 /*
324 * an Arena is a log of Clumps, preceeded by an ArenaHeader,
325 * and followed by a Arena, each in one disk block.
326 * struct on disk is not always up to date, but should be self-consistent.
327 * to sync after reboot, follow clumps starting at used until ClumpFreeMagic if found.
328 * <struct name="Arena" type="Arena *">
329 * <field name="name" val="s->name" type="AName"/>
330 * <field name="version" val="s->version" type="U32int"/>
331 * <field name="partition" val="s->part->name" type="AName"/>
332 * <field name="blocksize" val="s->blocksize" type="U32int"/>
333 * <field name="start" val="s->base" type="U64int"/>
334 * <field name="stop" val="s->base+2*s->blocksize" type="U64int"/>
335 * <field name="created" val="s->ctime" type="U32int"/>
336 * <field name="modified" val="s->wtime" type="U32int"/>
337 * <field name="sealed" val="s->sealed" type="Sealed"/>
338 * <field name="score" val="s->score" type="Score"/>
339 * <field name="clumps" val="s->clumps" type="U32int"/>
340 * <field name="compressedclumps" val="s->cclumps" type="U32int"/>
341 * <field name="data" val="s->uncsize" type="U64int"/>
342 * <field name="compresseddata" val="s->used - s->clumps * ClumpSize" type="U64int"/>
343 * <field name="storage" val="s->used + s->clumps * ClumpInfoSize" type="U64int"/>
344 * </struct>
345 */
346 struct Arena
348 QLock lock; /* lock for arena fields, writing to disk */
349 Part *part; /* partition in which arena lives */
350 int blocksize; /* size of block to read or write */
351 u64int base; /* base address on disk */
352 u64int size; /* total space in the arena */
353 u64int limit; /* storage limit for clumps */
354 u8int score[VtScoreSize]; /* score of the entire sealed & summed arena */
356 int clumpmax; /* ClumpInfos per block */
357 AState mem;
358 int inqueue;
359 DigestState sha1;
361 /*
362 * fields stored on disk
363 */
364 u32int version;
365 char name[ANameSize]; /* text label */
366 ATailStats memstats;
367 ATailStats diskstats;
368 u32int ctime; /* first time a block was written */
369 u32int wtime; /* last time a block was written */
370 u32int clumpmagic;
371 };
373 /*
374 * redundant storage of some fields at the beginning of each arena
375 */
376 struct ArenaHead
378 u32int version;
379 char name[ANameSize];
380 u32int blocksize;
381 u64int size;
382 u32int clumpmagic;
383 };
385 /*
386 * most interesting meta information for a clump.
387 * stored in each clump's header and in the Arena's directory,
388 * stored in reverse order just prior to the arena trailer
389 */
390 struct ClumpInfo
392 u8int type;
393 u16int size; /* size of disk data, not including header */
394 u16int uncsize; /* size of uncompressed data */
395 u8int score[VtScoreSize]; /* score of the uncompressed data only */
396 };
398 /*
399 * header for an immutable clump of data
400 */
401 struct Clump
403 ClumpInfo info;
404 u8int encoding;
405 u32int creator; /* initial client which wrote the block */
406 u32int time; /* creation at gmt seconds since 1/1/1970 */
407 };
409 /*
410 * index of all clumps according to their score
411 * this is just a wrapper to tie together the index sections
412 * <struct name="Index" type="Index *">
413 * <field name="name" val="s->name" type="AName"/>
414 * <field name="version" val="s->version" type="U32int"/>
415 * <field name="blocksize" val="s->blocksize" type="U32int"/>
416 * <field name="tabsize" val="s->tabsize" type="U32int"/>
417 * <field name="buckets" val="s->buckets" type="U32int"/>
418 * <field name="buckdiv" val="s->div" type="U32int"/>
419 * <field name="bitblocks" val="s->div" type="U32int"/>
420 * <field name="maxdepth" val="s->div" type="U32int"/>
421 * <field name="bitkeylog" val="s->div" type="U32int"/>
422 * <field name="bitkeymask" val="s->div" type="U32int"/>
423 * <array name="sect" val="&s->smap[i]" elems="s->nsects" type="Amap"/>
424 * <array name="amap" val="&s->amap[i]" elems="s->narenas" type="Amap"/>
425 * <array name="arena" val="s->arenas[i]" elems="s->narenas" type="Arena"/>
426 * </struct>
427 * <struct name="Amap" type="AMap *">
428 * <field name="name" val="s->name" type="AName"/>
429 * <field name="start" val="s->start" type="U64int"/>
430 * <field name="stop" val="s->stop" type="U64int"/>
431 * </struct>
432 */
433 struct Index
435 u32int div; /* divisor for mapping score to bucket */
436 u32int buckets; /* last bucket used in disk hash table */
437 u32int blocksize;
438 u32int tabsize; /* max. bytes in index config */
439 u32int bitblocks; //XXX remove these fields
440 u32int maxdepth;
441 u32int bitkeylog;
442 u32int bitkeymask;
444 int mapalloc; /* first arena to check when adding a lump */
445 Arena **arenas; /* arenas in the mapping */
446 ISect **sects; /* sections which hold the buckets */
447 Bloom *bloom; /* bloom filter */
449 /*
450 * fields stored in config file
451 */
452 u32int version;
453 char name[ANameSize]; /* text label */
454 int nsects;
455 AMap *smap; /* mapping of buckets to index sections */
456 int narenas;
457 AMap *amap; /* mapping from index addesses to arenas */
458 };
460 /*
461 * one part of the bucket storage for an index.
462 * the index blocks are sequentially allocated
463 * across all of the sections.
464 */
465 struct ISect
467 Part *part;
468 int blocklog; /* log2(blocksize) */
469 int buckmax; /* max. entries in a index bucket */
470 u32int tabbase; /* base address of index config table on disk */
471 u32int tabsize; /* max. bytes in index config */
472 Channel *writechan;
473 Channel *writedonechan;
475 /*
476 * fields stored on disk
477 */
478 u32int version;
479 u32int bucketmagic;
480 char name[ANameSize]; /* text label */
481 char index[ANameSize]; /* index owning the section */
482 u32int blocksize; /* size of hash buckets in index */
483 u32int blockbase; /* address of start of on disk index table */
484 u32int blocks; /* total blocks on disk; some may be unused */
485 u32int start; /* first bucket in this section */
486 u32int stop; /* limit of buckets in this section */
487 };
489 /*
490 * externally interesting part of an IEntry
491 */
492 struct IAddr
494 u64int addr;
495 u16int size; /* uncompressed size */
496 u8int type; /* type of block */
497 u8int blocks; /* arena io quanta for Clump + data */
498 };
500 /*
501 * entries in the index
502 * kept in IBuckets in the disk index table,
503 * cached in the memory ICache.
504 */
505 struct IEntry
507 u8int score[VtScoreSize];
508 IEntry *next; /* next in hash chain */
509 IEntry *nextdirty; /* next in dirty chain */
510 u32int wtime; /* last write time */
511 u16int train; /* relative train containing the most recent ref; 0 if no ref, 1 if in same car */
512 u8int rac; /* read ahead count */
513 u8int dirty; /* is dirty */
514 IAddr ia;
515 };
517 /*
518 * buckets in the on disk index table
519 */
520 struct IBucket
522 u16int n; /* number of active indices */
523 u32int buck; /* used by buildindex/checkindex only */
524 u8int *data;
525 };
527 /*
528 * temporary buffers used by individual threads
529 */
530 struct ZBlock
532 u32int len;
533 u32int _size;
534 u8int *data;
535 u8int *free;
536 };
538 /*
539 * simple input buffer for a '\0' terminated text file
540 */
541 struct IFile
543 char *name; /* name of the file */
544 ZBlock *b; /* entire contents of file */
545 u32int pos; /* current position in the file */
546 };
548 struct Statdesc
550 char *name;
551 ulong max;
552 };
/*
 * indices of the individual statistics; NStat is the count.
 * keep in sync with stats.c:/statdesc and httpd.c:/graphname
 */
enum
{
	StatRpcTotal,
	StatRpcRead,
	StatRpcReadOk,
	StatRpcReadFail,
	StatRpcReadBytes,
	StatRpcReadTime,
	StatRpcReadCached,
	StatRpcReadCachedTime,
	StatRpcReadUncached,
	StatRpcReadUncachedTime,
	StatRpcWrite,
	StatRpcWriteNew,
	StatRpcWriteOld,
	StatRpcWriteFail,
	StatRpcWriteBytes,
	StatRpcWriteTime,
	StatRpcWriteNewTime,
	StatRpcWriteOldTime,

	StatLcacheHit,
	StatLcacheMiss,
	StatLcacheRead,
	StatLcacheWrite,
	StatLcacheSize,
	StatLcacheStall,
	StatLcacheReadTime,

	StatDcacheHit,
	StatDcacheMiss,
	StatDcacheLookup,
	StatDcacheRead,
	StatDcacheWrite,
	StatDcacheDirty,
	StatDcacheSize,
	StatDcacheFlush,
	StatDcacheStall,
	StatDcacheLookupTime,

	StatDblockStall,
	StatLumpStall,

	StatIcacheHit,
	StatIcacheMiss,
	StatIcacheRead,
	StatIcacheWrite,
	StatIcacheFill,
	StatIcachePrefetch,
	StatIcacheDirty,
	StatIcacheSize,
	StatIcacheFlush,
	StatIcacheStall,
	StatIcacheReadTime,

	StatBloomHit,
	StatBloomMiss,
	StatBloomFalseMiss,
	StatBloomLookup,
	StatBloomOnes,
	StatBloomBits,
	StatBloomLookupTime,

	StatApartRead,
	StatApartReadBytes,
	StatApartWrite,
	StatApartWriteBytes,

	StatIsectRead,
	StatIsectReadBytes,
	StatIsectWrite,
	StatIsectWriteBytes,

	StatSumRead,
	StatSumReadBytes,

	NStat
};
634 extern Statdesc statdesc[NStat];
636 /*
637 * statistics about the operation of the server
638 * mainly for performance monitoring and profiling.
639 */
640 struct Stats
642 ulong now;
643 ulong n[NStat];
644 };
646 struct Statbin
648 uint nsamp;
649 uint min;
650 uint max;
651 uint avg;
652 };
654 struct Graph
656 long (*fn)(Stats*, Stats*, void*);
657 void *arg;
658 long t0;
659 long t1;
660 long min;
661 long max;
662 long wid;
663 long ht;
664 int fill;
665 };
667 /*
668 * for kicking background processes that run one round after another after another
669 */
670 struct Round
672 QLock lock;
673 Rendez start;
674 Rendez finish;
675 Rendez delaywait;
676 int delaytime;
677 int delaykick;
678 char* name;
679 int last;
680 int current;
681 int next;
682 int doanother;
683 };
685 /*
686 * Bloom filter of stored block hashes
687 */
688 struct Bloom
690 RWLock lk; /* protects nhash, nbits, tab, mb */
691 QLock mod; /* one marker at a time, protects nb */
692 int nhash;
693 ulong size; /* bytes in tab */
694 ulong mask; /* to produce index */
695 u8int *data;
696 Part *part;
697 Channel *writechan;
698 Channel *writedonechan;
699 };
701 extern Index *mainindex;
702 extern u32int maxblocksize; /* max. block size used by any partition */
703 extern int paranoid; /* should verify hashes on disk read */
704 extern int queuewrites; /* put all lump writes on a queue and finish later */
705 extern int readonly; /* only allowed to read the disk data */
706 extern Stats stats;
707 extern u8int zeroscore[VtScoreSize];
708 extern int compressblocks;
709 extern int writestodevnull; /* dangerous - for performance debugging */
710 extern int collectstats;
711 extern QLock memdrawlock;
712 extern int icachesleeptime;
713 extern int arenasumsleeptime;
/*
 * only when PLAN9PORT is not defined:
 * tell the compiler's vararg checker that the "V" format takes a uchar*,
 * and define ODIRECT as 0.
 * NOTE(review): presumably 0 so the flag can be or'ed into an open
 * mode with no effect -- confirm against its uses.
 */
#ifndef PLAN9PORT
#pragma	varargck	type	"V"	uchar*
#define	ODIRECT	0
#endif