Blob


/*
 * forward declarations so the structures below can refer to
 * each other by their short names
 */
typedef struct Config		Config;
typedef struct AMap		AMap;
typedef struct AMapN		AMapN;
typedef struct Arena		Arena;
typedef struct ArenaHead	ArenaHead;
typedef struct ArenaPart	ArenaPart;
typedef struct CIBlock		CIBlock;
typedef struct Clump		Clump;
typedef struct ClumpInfo	ClumpInfo;
typedef struct IAddr		IAddr;
typedef struct IBucket		IBucket;
typedef struct ICache		ICache;
typedef struct IEStream		IEStream;
typedef struct IEntry		IEntry;
typedef struct IFile		IFile;
typedef struct ISect		ISect;
typedef struct Index		Index;
typedef struct Lump		Lump;
typedef struct DBlock		DBlock;
typedef struct Part		Part;
typedef struct Stats		Stats;
typedef struct ZBlock		ZBlock;

/*
 * all-ones values of each unsigned width; the outer cast truncates
 * the result of ~ back to the named type after integer promotion
 */
#define	TWID32	((u32int)~(u32int)0)
#define	TWID64	((u64int)~(u64int)0)
#define	TWID8	((u8int)~(u8int)0)

28 enum
29 {
30 ABlockLog = 9, /* log2(512), the quantum for reading arenas */
31 ANameSize = 64,
32 MaxDiskBlock = 64*1024, /* max. allowed size for a disk block */
33 MaxIoSize = 64*1024, /* max. allowed size for a disk io operation */
34 PartBlank = 256*1024, /* untouched section at beginning of partition */
35 HeadSize = 512, /* size of a header after PartBlank */
36 MinArenaSize = 1*1024*1024, /* smallest reasonable arena size */
37 IndexBase = 1024*1024, /* initial address to use in an index */
38 MaxIo = 64*1024, /* max size of a single read or write operation */
39 ICacheBits = 16, /* default bits for indexing icache */
40 ICacheDepth = 4, /* default depth of an icache hash chain */
41 MaxAMap = 2*1024, /* max. allowed arenas in an address mapping; must be < 32*1024 */
43 /*
44 * return codes from syncarena
45 */
46 SyncDataErr = 1 << 0, /* problem reading the clump data */
47 SyncCIErr = 1 << 1, /* found erroneous clump directory entries */
48 SyncCIZero = 1 << 2, /* found unwritten clump directory entries */
49 SyncFixErr = 1 << 3, /* error writing fixed data */
50 SyncHeader = 1 << 4, /* altered header fields */
52 /*
53 * error severity
54 */
55 EOk = 0, /* error expected in normal operation */
56 EStrange, /* strange error that should be logged */
57 ECorrupt, /* corrupted data found in arenas */
58 EICorrupt, /* corrupted data found in index */
59 EAdmin, /* should be brought to administrators' attention */
60 ECrash, /* really bad internal error */
61 EBug, /* a limitation which should be fixed */
62 EInconsist, /* inconsistencies between index and arena */
63 EMax,
65 /*
66 * internal disk formats for the venti archival storage system
67 */
68 /*
69 * magic numbers on disk
70 */
71 ClumpMagic = 0xd15cb10c, /* clump header */
72 ClumpFreeMagic = 0, /* free clump; terminates active clump log */
74 ArenaPartMagic = 0xa9e4a5e7, /* arena partition header */
75 ArenaMagic = 0xf2a14ead, /* arena trailer */
76 ArenaHeadMagic = 0xd15c4ead, /* arena header */
78 ISectMagic = 0xd15c5ec7, /* index header */
80 ArenaPartVersion = 3,
81 ArenaVersion = 4,
82 IndexVersion1 = 1,
83 IndexVersion2 = 2,
84 ISectVersion = 1,
86 /*
87 * encodings of clumps on disk
88 */
89 ClumpEErr = 0, /* can't happen */
90 ClumpENone, /* plain */
91 ClumpECompress, /* compressed */
92 ClumpEMax,
94 /*
95 * marker for corrupted data on disk
96 */
97 VtTypeCorrupt = VtMaxType,
99 /*
100 * sizes in bytes on disk
101 */
102 U8Size = 1,
103 U16Size = 2,
104 U32Size = 4,
105 U64Size = 8,
107 ArenaPartSize = 4 * U32Size,
108 ArenaSize = 2 * U64Size + 6 * U32Size + ANameSize + U8Size,
109 ArenaHeadSize = U64Size + 3 * U32Size + ANameSize,
110 ISectSize = 7 * U32Size + 2 * ANameSize,
111 ClumpInfoSize = U8Size + 2 * U16Size + VtScoreSize,
112 ClumpSize = ClumpInfoSize + U8Size + 3 * U32Size,
113 IBucketSize = U32Size + U16Size,
114 IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize,
115 IEntryTypeOff = VtScoreSize + U64Size + U32Size + 2 * U16Size,
117 MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,
119 /*
120 * dirty flags - order controls disk write order
121 */
122 DirtyArena = 1,
123 DirtyIndexSplit,
124 DirtyIndex,
125 DirtyIndexBitmap,
126 DirtyArenaCib,
127 DirtyArenaTrailer,
128 DirtyMax,
130 VentiZZZZZZZZ
131 };
133 /*
134 * results of parsing and initializing a config file
135 */
136 struct Config
138 char *index; /* name of the index to initialize */
139 int naparts; /* arena partitions initialized */
140 ArenaPart **aparts;
141 int nsects; /* index sections initialized */
142 ISect **sects;
143 };
145 /*
146 * a Part is the low level interface to files or disks.
147 * there are two main types of partitions
148 * arena paritions, which some number of arenas, each in a sub-partition.
149 * index partition, which only have one subpartition.
150 */
151 struct Part
153 int fd; /* rock for accessing the disk */
154 u64int size; /* size of the partiton */
155 u32int blocksize; /* block size for reads and writes */
156 char *name;
157 Channel *writechan; /* chan[dcache.nblock](DBlock*) */
158 };
160 /*
161 * a cached block from the partition
162 * yuck -- most of this is internal structure for the cache
163 * all other routines should only use data
164 */
165 struct DBlock
167 u8int *data;
169 Part *part; /* partition in which cached */
170 u64int addr; /* base address on the partition */
171 u16int size; /* amount of data available, not amount allocated; should go away */
172 u32int dirty;
173 u32int dirtying;
174 DBlock *next; /* doubly linked hash chains */
175 DBlock *prev;
176 u32int heap; /* index in heap table */
177 u32int used; /* last reference times */
178 u32int used2;
179 u32int ref; /* reference count */
180 QLock lock; /* for access to data only */
181 Channel writedonechan;
182 void* chanbuf[1]; /* buffer for the chan! */
183 };
185 /*
186 * a cached block from the partition
187 * yuck -- most of this is internal structure for the cache
188 * all other routines should only use data
189 * double yuck -- this is mostly the same as a DBlock
190 */
191 struct Lump
193 Packet *data;
195 Part *part; /* partition in which cached */
196 u8int score[VtScoreSize]; /* score of packet */
197 u8int type; /* type of packet */
198 u16int size; /* amount of data allocated to hold packet */
199 Lump *next; /* doubly linked hash chains */
200 Lump *prev;
201 u32int heap; /* index in heap table */
202 u32int used; /* last reference times */
203 u32int used2;
204 u32int ref; /* reference count */
205 QLock lock; /* for access to data only */
206 };
208 /*
209 * mapping between names and address ranges
210 */
211 struct AMap
213 u64int start;
214 u64int stop;
215 char name[ANameSize];
216 };
218 /*
219 * an AMap along with a length
220 */
221 struct AMapN
223 int n;
224 AMap *map;
225 };
227 /*
228 * an ArenaPart is a partition made up of Arenas
229 * it exists because most os's don't support many partitions,
230 * and we want to have many different Arenas
231 */
232 struct ArenaPart
234 Part *part;
235 u64int size; /* size of underlying partition, rounded down to blocks */
236 Arena **arenas;
237 u32int tabbase; /* base address of arena table on disk */
238 u32int tabsize; /* max. bytes in arena table */
240 /*
241 * fields stored on disk
242 */
243 u32int version;
244 u32int blocksize; /* "optimal" block size for reads and writes */
245 u32int arenabase; /* base address of first arena */
247 /*
248 * stored in the arena mapping table on disk
249 */
250 AMap *map;
251 int narenas;
252 };
254 /*
255 * info about one block in the clump info cache
256 */
257 struct CIBlock
259 u32int block; /* blocks in the directory */
260 int offset; /* offsets of one clump in the data */
261 DBlock *data;
262 };
264 /*
265 * an Arena is a log of Clumps, preceeded by an ArenaHeader,
266 * and followed by a Arena, each in one disk block.
267 * struct on disk is not always up to date, but should be self-consistent.
268 * to sync after reboot, follow clumps starting at used until ClumpFreeMagic if found.
269 * <struct name="Arena" type="Arena *">
270 * <field name="name" val="s->name" type="AName"/>
271 * <field name="version" val="s->version" type="U32int"/>
272 * <field name="partition" val="s->part->name" type="AName"/>
273 * <field name="blocksize" val="s->blocksize" type="U32int"/>
274 * <field name="start" val="s->base" type="U64int"/>
275 * <field name="stop" val="s->base+2*s->blocksize" type="U64int"/>
276 * <field name="created" val="s->ctime" type="U32int"/>
277 * <field name="modified" val="s->wtime" type="U32int"/>
278 * <field name="sealed" val="s->sealed" type="Sealed"/>
279 * <field name="score" val="s->score" type="Score"/>
280 * <field name="clumps" val="s->clumps" type="U32int"/>
281 * <field name="compressedclumps" val="s->cclumps" type="U32int"/>
282 * <field name="data" val="s->uncsize" type="U64int"/>
283 * <field name="compresseddata" val="s->used - s->clumps * ClumpSize" type="U64int"/>
284 * <field name="storage" val="s->used + s->clumps * ClumpInfoSize" type="U64int"/>
285 * </struct>
286 */
287 struct Arena
289 QLock lock; /* lock for arena fields, writing to disk */
290 Part *part; /* partition in which arena lives */
291 int blocksize; /* size of block to read or write */
292 u64int base; /* base address on disk */
293 u64int size; /* total space in the arena */
294 u64int limit; /* storage limit for clumps */
295 u8int score[VtScoreSize]; /* score of the entire sealed & summed arena */
297 int clumpmax; /* ClumpInfos per block */
298 CIBlock cib; /* dirty clump directory block */
300 /*
301 * fields stored on disk
302 */
303 u32int version;
304 char name[ANameSize]; /* text label */
305 u32int clumps; /* number of allocated clumps */
306 u32int cclumps; /* clumps which are compressed; informational only */
307 u32int ctime; /* first time a block was written */
308 u32int wtime; /* last time a block was written */
309 u64int used; /* number of bytes currently used */
310 u64int uncsize; /* total of all clumps's uncsize; informational only */
311 u8int sealed; /* arena all filled up? */
312 };
314 /*
315 * redundant storage of some fields at the beginning of each arena
316 */
317 struct ArenaHead
319 u32int version;
320 char name[ANameSize];
321 u32int blocksize;
322 u64int size;
323 };
325 /*
326 * most interesting meta information for a clump.
327 * stored in each clump's header and in the Arena's directory,
328 * stored in reverse order just prior to the arena trailer
329 */
330 struct ClumpInfo
332 u8int type;
333 u16int size; /* size of disk data, not including header */
334 u16int uncsize; /* size of uncompressed data */
335 u8int score[VtScoreSize]; /* score of the uncompressed data only */
336 };
338 /*
339 * header for an immutable clump of data
340 */
341 struct Clump
343 ClumpInfo info;
344 u8int encoding;
345 u32int creator; /* initial client which wrote the block */
346 u32int time; /* creation at gmt seconds since 1/1/1970 */
347 };
349 /*
350 * index of all clumps according to their score
351 * this is just a wrapper to tie together the index sections
352 * <struct name="Index" type="Index *">
353 * <field name="name" val="s->name" type="AName"/>
354 * <field name="version" val="s->version" type="U32int"/>
355 * <field name="blocksize" val="s->blocksize" type="U32int"/>
356 * <field name="tabsize" val="s->tabsize" type="U32int"/>
357 * <field name="buckets" val="s->buckets" type="U32int"/>
358 * <field name="buckdiv" val="s->div" type="U32int"/>
359 * <field name="bitblocks" val="s->div" type="U32int"/>
360 * <field name="maxdepth" val="s->div" type="U32int"/>
361 * <field name="bitkeylog" val="s->div" type="U32int"/>
362 * <field name="bitkeymask" val="s->div" type="U32int"/>
363 * <array name="sect" val="&s->smap[i]" elems="s->nsects" type="Amap"/>
364 * <array name="amap" val="&s->amap[i]" elems="s->narenas" type="Amap"/>
365 * <array name="arena" val="s->arenas[i]" elems="s->narenas" type="Arena"/>
366 * </struct>
367 * <struct name="Amap" type="AMap *">
368 * <field name="name" val="s->name" type="AName"/>
369 * <field name="start" val="s->start" type="U64int"/>
370 * <field name="stop" val="s->stop" type="U64int"/>
371 * </struct>
372 */
373 struct Index
375 u32int div; /* divisor for mapping score to bucket */
376 u32int buckets; /* last bucket used in disk hash table */
377 u32int blocksize;
378 u32int tabsize; /* max. bytes in index config */
379 u32int bitblocks;
380 u32int maxdepth;
381 u32int bitkeylog;
382 u32int bitkeymask;
384 int mapalloc; /* first arena to check when adding a lump */
385 Arena **arenas; /* arenas in the mapping */
386 ISect **sects; /* sections which hold the buckets */
388 /*
389 * fields stored in config file
390 */
391 u32int version;
392 char name[ANameSize]; /* text label */
393 int nsects;
394 AMap *smap; /* mapping of buckets to index sections */
395 int narenas;
396 AMap *amap; /* mapping from index addesses to arenas */
397 };
399 /*
400 * one part of the bucket storage for an index.
401 * the index blocks are sequentially allocated
402 * across all of the sections.
403 */
404 struct ISect
406 Part *part;
407 int blocklog; /* log2(blocksize) */
408 int buckmax; /* max. entries in a index bucket */
409 u32int tabbase; /* base address of index config table on disk */
410 u32int tabsize; /* max. bytes in index config */
412 /*
413 * fields stored on disk
414 */
415 u32int version;
416 char name[ANameSize]; /* text label */
417 char index[ANameSize]; /* index owning the section */
418 u32int blocksize; /* size of hash buckets in index */
419 u32int blockbase; /* address of start of on disk index table */
420 u32int blocks; /* total blocks on disk; some may be unused */
421 u32int start; /* first bucket in this section */
422 u32int stop; /* limit of buckets in this section */
423 };
425 /*
426 * externally interesting part of an IEntry
427 */
428 struct IAddr
430 u64int addr;
431 u16int size; /* uncompressed size */
432 u8int type; /* type of block */
433 u8int blocks; /* arena io quanta for Clump + data */
434 };
436 /*
437 * entries in the index
438 * kept in IBuckets in the disk index table,
439 * cached in the memory ICache.
440 */
441 struct IEntry
443 u8int score[VtScoreSize];
444 IEntry *next; /* next in hash chain */
445 u32int wtime; /* last write time */
446 u16int train; /* relative train containing the most recent ref; 0 if no ref, 1 if in same car */
447 u8int rac; /* read ahead count */
448 IAddr ia;
449 };
451 /*
452 * buckets in the on disk index table
453 */
454 struct IBucket
456 u16int n; /* number of active indices */
457 u32int depth; /* depth in version 2 (was overflow in v1) */
458 u8int *data;
459 };
461 /*
462 * temporary buffers used by individual threads
463 */
464 struct ZBlock
466 u32int len;
467 u8int *data;
468 };
470 /*
471 * simple input buffer for a '\0' terminated text file
472 */
473 struct IFile
475 char *name; /* name of the file */
476 ZBlock *b; /* entire contents of file */
477 u32int pos; /* current position in the file */
478 };
480 /*
481 * statistics about the operation of the server
482 * mainly for performance monitoring and profiling.
483 */
484 struct Stats
486 QLock lock;
487 long lumpwrites; /* protocol block writes */
488 long lumpreads; /* protocol block reads */
489 long lumphit; /* lump cache hit */
490 long lumpmiss; /* lump cache miss */
491 long clumpwrites; /* clumps to disk */
492 vlong clumpbwrites; /* clump data bytes to disk */
493 vlong clumpbcomp; /* clump bytes compressed */
494 long clumpreads; /* clumps from disk */
495 vlong clumpbreads; /* clump data bytes from disk */
496 vlong clumpbuncomp; /* clump bytes uncompressed */
497 long ciwrites; /* clump directory to disk */
498 long cireads; /* clump directory from disk */
499 long indexwrites; /* index to disk */
500 long indexreads; /* index from disk */
501 long indexwreads; /* for writing a new entry */
502 long indexareads; /* for allocating an overflow block */
503 long indexsplits; /* index block splits */
504 long diskwrites; /* total disk writes */
505 long diskreads; /* total disk reads */
506 vlong diskbwrites; /* total disk bytes written */
507 vlong diskbreads; /* total disk bytes read */
508 long pchit; /* partition cache hit */
509 long pcmiss; /* partition cache miss */
510 long pcreads; /* partition cache reads from disk */
511 vlong pcbreads; /* partition cache bytes read */
512 long icinserts; /* stores into index cache */
513 long iclookups; /* index cache lookups */
514 long ichits; /* hits in the cache */
515 long icfills; /* successful fills from index */
516 long absorbedwrites; /* disk writes absorbed by dcache */
517 long dirtydblocks; /* blocks dirtied */
518 long dcacheflushes; /* times dcache has flushed */
519 long dcacheflushwrites; /* blocks written by those flushes */
520 };
522 extern Index *mainindex;
523 extern u32int maxblocksize; /* max. block size used by any partition */
524 extern int paranoid; /* should verify hashes on disk read */
525 extern int queuewrites; /* put all lump writes on a queue and finish later */
526 extern int readonly; /* only allowed to read the disk data */
527 extern Stats stats;
528 extern u8int zeroscore[VtScoreSize];