|
1 # HG changeset patch |
|
2 # User Michael Gerdts <[email protected]> |
|
3 # Date 1412600364 25200 |
|
4 # Mon Oct 06 05:59:24 2014 -0700 |
|
5 # Node ID ba834c48cee26e4c43976af8477dd34863b40f18 |
|
6 # Parent 1debb63439545fd0b30153eb68e884623d06c531 |
|
7 parallel uncompress - developed by Oracle |
|
8 Offered to upstream at https://github.com/mgerdts/pigz |
|
9 - Branch mt-uncompress-2.2 forked from https://github.com/madler/pigz v. 2.2.6 |
|
10 - Branch mt-uncompress forked from https://github.com/madler/pigz branch master |
|
11 |
|
12 diff -r 1debb6343954 -r ba834c48cee2 Makefile |
|
13 --- a/Makefile |
|
14 +++ b/Makefile |
|
15 @@ -44,6 +44,15 @@ |
|
16 compress -f < pigz.c | ./unpigz | cmp - pigz.c ;\ |
|
17 fi |
|
18 @rm -f pigz.c.gz pigz.c.zz pigz.c.zip |
|
19 + @rm -rf d/1 d/2 |
|
20 + (mkdir -p d/1; cd d/1; tar xzf ../../../../pigz-2.2.5.tar.gz; \ |
|
21 + cd ..; cp -pr 1 2; ../pigz -rp 4 --index %z 1; \ |
|
22 + ../pigz -drp 4 --index %z 1; diff -r 1 2) |
|
23 + @rm -rf d/1 d/2 |
|
24 + (mkdir -p d/1; cd d/1; tar xzf ../../../../pigz-2.2.5.tar.gz; \ |
|
25 + cd ..; cp -pr 1 2; ../pigz -zrp 4 -X %f.idx 1; \ |
|
26 + ../pigz -dzrp 4 -X %f.idx 1; diff -r 1 2) |
|
27 + @rm -rf d/1 d/2 |
|
28 |
|
29 tests: dev test |
|
30 ./pigzn -kf pigz.c ; ./pigz -t pigz.c.gz |
|
31 diff -r 1debb6343954 -r ba834c48cee2 pigz.1 |
|
32 --- a/pigz.1 |
|
33 +++ b/pigz.1 |
|
34 @@ -180,6 +180,14 @@ |
|
35 .B -V --version |
|
36 Show the version of pigz. |
|
37 .TP |
|
38 +.B -X --index file |
|
39 +During compression, create an index that can be used for parallel |
|
40 +decompression. During decompression, use the specified index file for parallel |
|
41 +decompression. Each occurrence of %f and %z are replaced by the uncompressed |
|
42 +and compressed file names, respectively. If the index file is the same file as |
|
43 +the compressed file, the index is written to or read from the end of the |
|
44 +compressed file. |
|
45 +.TP |
|
46 .B -z --zlib |
|
47 Compress to zlib (.zz) instead of gzip format. |
|
48 .TP |
|
49 diff -r 1debb6343954 -r ba834c48cee2 pigz.c |
|
50 --- a/pigz.c |
|
51 +++ b/pigz.c |
|
52 @@ -191,13 +191,27 @@ |
|
53 effectiveness of deflating in a single thread. This can be turned off using |
|
54 the --independent or -i option, so that the blocks can be decompressed |
|
55 independently for partial error recovery or for random access. |
|
56 - |
|
57 - Decompression can't be parallelized, at least not without specially prepared |
|
58 - deflate streams for that purpose. As a result, pigz uses a single thread |
|
59 - (the main thread) for decompression, but will create three other threads for |
|
60 - reading, writing, and check calculation, which can speed up decompression |
|
61 - under some circumstances. Parallel decompression can be turned off by |
|
62 - specifying one process (-dp 1 or -tp 1). |
|
63 + |
|
64 + The --index or -X option causes the generation of a block index which can be |
|
65 + used for parallel decompression. The block index can be appended onto the |
|
66 + compressed output or it may be stored in a separate file. The uncompressed |
|
67 + size, compressed size, checksum of each block are stored in the index, |
|
68 + allowing future applications to perform random reads of the compressed file. |
|
69 + Streams generated with -X are readable by legacy versions of pigz and gzip. |
|
70 + |
|
71 + Decompression can be parallelized, but only if a block index is available. |
|
72 + If a block index is not present, pigz uses a single thread (the main thread) |
|
73 + for decompression, but will create three other threads for reading, writing, |
|
74 + and check calculation, which can speed up decompression under some |
|
75 + circumstances. Parallel decompression can be turned off by specifying one |
|
76 + process (-dp 1 or -tp 1). |
|
77 + |
|
78 + If the block index is present, the main thread reads the input file and |
|
79 + dispatches each block to an uncompress thread. The uncompress thread |
|
80 + uncompresses the block, verifies the block checksum, and passes the block |
|
81 + off to a writer thread. The writer thread writes the blocks in order, |
|
82 + and combines the individual block checksums into a per-file checksum. The |
|
83 + per-file checksum is compared to the checksum in the stream's trailer. |
|
84 |
|
85 pigz requires zlib 1.2.1 or later to allow setting the dictionary when doing |
|
86 raw deflate. Since zlib 1.2.3 corrects security vulnerabilities in zlib |
|
87 @@ -259,13 +273,14 @@ |
|
88 can't get way ahead of the write thread and build up a large backlog of |
|
89 unwritten compressed data. The write thread will write the compressed data, |
|
90 drop the output buffer, and then wait for the check value to be unlocked |
|
91 - by the compress thread. Then the write thread combines the check value for |
|
92 - this chunk with the total check value for eventual use in the trailer. If |
|
93 - this is not the last chunk, the write thread then goes back to look for the |
|
94 - next output chunk in sequence. After the last chunk, the write thread |
|
95 - returns and joins the main thread. Unlike the compress threads, a new write |
|
96 - thread is launched for each input stream. The write thread writes the |
|
97 - appropriate header and trailer around the compressed data. |
|
98 + by the compress thread. Then the write thread writes an index entry (if -X) |
|
99 + and combines the check value for this chunk with the total check value for |
|
100 + eventual use in the trailer. If this is not the last chunk, the write thread |
|
101 + then goes back to look for the next output chunk in sequence. After the last |
|
102 + chunk, the write thread returns and joins the main thread. Unlike the |
|
103 + compress threads, a new write thread is launched for each input stream. The |
|
104 + write thread writes the appropriate header and trailer around the compressed |
|
105 + data. |
|
106 |
|
107 The input and output buffers are reused through their collection in pools. |
|
108 Each buffer has a use count, which when decremented to zero returns the |
|
109 @@ -313,6 +328,9 @@ |
|
110 #if __STDC_VERSION__-0 >= 199901L || __GNUC__-0 >= 3 |
|
111 # include <inttypes.h> /* intmax_t */ |
|
112 #endif |
|
113 +#include <stddef.h> /* offsetof() */ |
|
114 +#include <sys/mman.h> /* mmap() */ |
|
115 +#include <netinet/in.h> /* htonl() */ |
|
116 |
|
117 #ifdef __hpux |
|
118 # include <sys/param.h> |
|
119 @@ -420,8 +438,10 @@ |
|
120 local char *prog; /* name by which pigz was invoked */ |
|
121 local int ind; /* input file descriptor */ |
|
122 local int outd; /* output file descriptor */ |
|
123 +local int idxd; /* index file descriptor */ |
|
124 local char in[PATH_MAX+1]; /* input file name (accommodate recursion) */ |
|
125 local char *out = NULL; /* output file name (allocated if not NULL) */ |
|
126 +local char *index = NULL; /* index file name template (may have %f, %z) */ |
|
127 local int verbosity; /* 0 = quiet, 1 = normal, 2 = verbose, 3 = trace */ |
|
128 local int headis; /* 1 to store name, 2 to store date, 3 both */ |
|
129 local int pipeout; /* write output to stdout even if file */ |
|
130 @@ -467,9 +487,12 @@ |
|
131 return 0; |
|
132 } |
|
133 |
|
134 +local void idx_abort(void); |
|
135 + |
|
136 /* exit with error, delete output file if in the middle of writing it */ |
|
137 local int bail(char *why, char *what) |
|
138 { |
|
139 + idx_abort(); |
|
140 if (outd != -1 && out != NULL) |
|
141 unlink(out); |
|
142 complain("abort: %s%s", why, what); |
|
143 @@ -684,11 +707,23 @@ |
|
144 return dos; |
|
145 } |
|
146 |
|
147 -/* put a 4-byte integer into a byte array in LSB order or MSB order */ |
|
148 +/* put integers into a byte array in LSB order or MSB order */ |
|
149 #define PUT2L(a,b) (*(a)=(b)&0xff,(a)[1]=(b)>>8) |
|
150 #define PUT4L(a,b) (PUT2L(a,(b)&0xffff),PUT2L((a)+2,(b)>>16)) |
|
151 +#define PUT8L(a,b) (PUT4L(a,(b)&0xffffffff),PUT4L((a)+4,(b)>>32)) |
|
152 #define PUT4M(a,b) (*(a)=(b)>>24,(a)[1]=(b)>>16,(a)[2]=(b)>>8,(a)[3]=(b)) |
|
153 |
|
154 +/* pull LSB order or MSB order integers from an unsigned char buffer */ |
|
155 +#define PULL2L(p) ((p)[0] + ((unsigned)((p)[1]) << 8)) |
|
156 +#define PULL4L(p) (PULL2L(p) + ((unsigned long)(PULL2L((p) + 2)) << 16)) |
|
157 +#define PULL8L(p) ((uint64_t)((p)[0]) | ((uint64_t)((p)[1]) << 8) | \ |
|
158 + ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) | \ |
|
159 + ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) | \ |
|
160 + ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56)) |
|
161 +#define PULL2M(p) (((unsigned)((p)[0]) << 8) + (p)[1]) |
|
162 +#define PULL4M(p) (((unsigned long)(PULL2M(p)) << 16) + PULL2M((p) + 2)) |
|
163 + |
|
164 + |
|
165 /* write a gzip, zlib, or zip header using the information in the globals */ |
|
166 local unsigned long put_header(void) |
|
167 { |
|
168 @@ -982,7 +1017,7 @@ |
|
169 |
|
170 /* get a space from a pool -- the use count is initially set to one, so there |
|
171 is no need to call use_space() for the first use */ |
|
172 -local struct space *get_space(struct pool *pool) |
|
173 +local struct space *get_space_size(struct pool *pool, size_t size) |
|
174 { |
|
175 struct space *space; |
|
176 |
|
177 @@ -995,6 +1030,15 @@ |
|
178 if (pool->head != NULL) { |
|
179 space = pool->head; |
|
180 possess(space->use); |
|
181 + /* If there's not enough space, free and malloc rather than realloc to |
|
182 + avoid the potential of an unnecessary memory copy. */ |
|
183 + if (space->size < size) { |
|
184 + free(space->buf); |
|
185 + space->buf = malloc(size); |
|
186 + if (space->buf == NULL) |
|
187 + bail("not enough memory", ""); |
|
188 + space->size = size; |
|
189 + } |
|
190 pool->head = space->next; |
|
191 twist(pool->have, BY, -1); /* one less in pool */ |
|
192 twist(space->use, TO, 1); /* initially one user */ |
|
193 @@ -1012,15 +1056,20 @@ |
|
194 if (space == NULL) |
|
195 bail("not enough memory", ""); |
|
196 space->use = new_lock(1); /* initially one user */ |
|
197 - space->buf = malloc(pool->size); |
|
198 + space->buf = malloc(size); |
|
199 if (space->buf == NULL) |
|
200 bail("not enough memory", ""); |
|
201 - space->size = pool->size; |
|
202 + space->size = size; |
|
203 space->len = 0; |
|
204 space->pool = pool; /* remember the pool this belongs to */ |
|
205 return space; |
|
206 } |
|
207 |
|
208 +local struct space *get_space(struct pool *pool) |
|
209 +{ |
|
210 + return get_space_size(pool, pool->size); |
|
211 +} |
|
212 + |
|
213 /* compute next size up by multiplying by about 2**(1/3) and round to the next |
|
214 power of 2 if we're close (so three applications results in doubling) -- if |
|
215 small, go up to at least 16, if overflow, go to max size_t value */ |
|
216 @@ -1109,17 +1158,35 @@ |
|
217 return count; |
|
218 } |
|
219 |
|
220 +/* prompt for permission to overwrite a file */ |
|
221 +local int allow_overwrite(const char *path) |
|
222 +{ |
|
223 + char ch; |
|
224 + int reply = -1; |
|
225 + |
|
226 + fprintf(stderr, "%s exists -- overwrite (y/n)? ", path); |
|
227 + fflush(stderr); |
|
228 + do { |
|
229 + ch = getchar(); |
|
230 + if (reply < 0 && ch != ' ' && ch != '\t') |
|
231 + reply = ch == 'y' || ch == 'Y' ? 1 : 0; |
|
232 + } while (ch != EOF && ch != '\n' && ch != '\r'); |
|
233 + return reply; |
|
234 +} |
|
235 + |
|
236 /* input and output buffer pools */ |
|
237 local struct pool in_pool; |
|
238 local struct pool out_pool; |
|
239 local struct pool dict_pool; |
|
240 local struct pool lens_pool; |
|
241 +local struct pool idx_pool; |
|
242 |
|
243 /* -- parallel compression -- */ |
|
244 |
|
245 /* compress or write job (passed from compress list to write list) -- if seq is |
|
246 equal to -1, compress_thread is instructed to return; if more is false then |
|
247 - this is the last chunk, which after writing tells write_thread to return */ |
|
248 + this is the last chunk, which after writing tells compress_write_thread to |
|
249 + return */ |
|
250 struct job { |
|
251 long seq; /* sequence number */ |
|
252 int more; /* true if this is not the last chunk */ |
|
253 @@ -1166,6 +1233,7 @@ |
|
254 new_pool(&out_pool, OUTPOOL(size), -1); |
|
255 new_pool(&dict_pool, DICT, -1); |
|
256 new_pool(&lens_pool, size >> (RSYNCBITS - 1), -1); |
|
257 + new_pool(&idx_pool, 1, -1); |
|
258 } |
|
259 |
|
260 /* command the compress threads to all return, then join them all (call from |
|
261 @@ -1202,6 +1270,8 @@ |
|
262 Trace(("-- freed %d output buffers", caught)); |
|
263 caught = free_pool(&in_pool); |
|
264 Trace(("-- freed %d input buffers", caught)); |
|
265 + caught = free_pool(&idx_pool); |
|
266 + Trace(("-- freed %d index buffers", caught)); |
|
267 free_lock(write_first); |
|
268 free_lock(compress_have); |
|
269 compress_have = NULL; |
|
270 @@ -1395,18 +1465,483 @@ |
|
271 (void)deflateEnd(&strm); |
|
272 } |
|
273 |
|
274 +/* Block Index |
|
275 + |
|
276 + The block index is an array of idx_entry structs followed by an idx_trailer |
|
277 + struct. They are written to the file in LSB order. The block index can |
|
278 + exist as a standalone file or be appended onto the compressed files. |
|
279 + |
|
280 + The trailer is used to identify a block index. The beginning of the trailer |
|
281 + contains a magic number that is a value too large to be confused with a valid |
|
282 + block length. Aside from backwards P's the magic number looks kinda like |
|
283 + "0xf pigzip 0xf". */ |
|
284 +#define IDXMAGIC 0xf916219f |
|
285 + |
|
286 +struct idx_trailer { |
|
287 + uint32_t magic; |
|
288 + uint64_t count; |
|
289 +}; |
|
290 + |
|
291 +struct idx_entry { |
|
292 + uint32_t infsz; /* inflated size of the block */ |
|
293 + uint32_t defsz; /* deflated size of the block */ |
|
294 + uint32_t check; /* adler32 or crc32 checksum of the block */ |
|
295 +}; |
|
296 + |
|
297 +local struct { |
|
298 + int valid; /* Do the rest of these fields mean anything? */ |
|
299 + |
|
300 + /* An array of entries. References address in space or map */ |
|
301 + struct idx_entry *ents; /* not in right byte order, used for offset */ |
|
302 + uint64_t seq; /* current entry */ |
|
303 + int64_t eof; /* has the last entry been retrieved? */ |
|
304 + |
|
305 + /* When compressing and appending, entries are stored in space->buf. */ |
|
306 + int append; /* is the index at end of compressed file? */ |
|
307 + struct space *space; /* space for storage of index */ |
|
308 + |
|
309 + /* The following are valid only when mmap is used. */ |
|
310 + uchar_t *map; /* mmap'd region containing ents */ |
|
311 + size_t mapsz; /* size of mmap'd region at map */ |
|
312 + off_t mapoff; /* bytes between map and ents */ |
|
313 + |
|
314 + /* Index path, after %f and %z are replaced. */ |
|
315 + char path[PATH_MAX+1]; |
|
316 +} idx; |
|
317 + |
|
318 +/* determines if the two paths refer to the same extant file */ |
|
319 +local int same_file(const char *f1, const char *f2) |
|
320 +{ |
|
321 + struct stat s1; |
|
322 + struct stat s2; |
|
323 + |
|
324 + return (stat(f1, &s1) == 0 && stat(f2, &s2) == 0 && |
|
325 + s1.st_dev == s2.st_dev && s1.st_ino == s2.st_ino); |
|
326 +} |
|
327 + |
|
328 +/* Remove the index file, but only if it is not the same as in or out. |
|
329 + We don't worry about a full cleanup, as this should only be called in an |
|
330 + error path just before exiting. */ |
|
331 +local void idx_abort(void) |
|
332 +{ |
|
333 + if (!idx.valid) |
|
334 + return; |
|
335 + if (idx.path[0] == '\0' || idx.append) |
|
336 + return; |
|
337 + (void) unlink(idx.path); |
|
338 +} |
|
339 + |
|
340 +/* If 0 is returned, a trailer was found and read. Non-zero return means |
|
341 + there was no trailer. Does not exit. Does not change file pointer for fd. */ |
|
342 +local int idx_read_trailer(int fd, char *path, struct idx_trailer *trail) |
|
343 +{ |
|
344 + uchar_t buf[sizeof(*trail)]; |
|
345 + off_t off; |
|
346 + struct stat st; |
|
347 + |
|
348 + if (fd < 0) { |
|
349 + Trace(("%s: index file descriptor %d not valid", path, fd)); |
|
350 + return -1; |
|
351 + } |
|
352 + if (fstat(fd, &st) != 0 || !S_ISREG(st.st_mode)) { |
|
353 + Trace(("%s: index appended to non-regular file", path)); |
|
354 + return -1; |
|
355 + } |
|
356 + off = st.st_size - sizeof(*trail); |
|
357 + if (off < 0) { |
|
358 + Trace(("%s: index file too short for header", path)); |
|
359 + return -1; |
|
360 + } |
|
361 + if (pread(fd, buf, sizeof(buf), off) != sizeof(buf)) { |
|
362 + Trace(("%s: unable to read index trailer", path)); |
|
363 + return -1; |
|
364 + } |
|
365 + trail->magic = PULL4L(buf); |
|
366 + trail->count = PULL8L(buf + 4); |
|
367 + |
|
368 + if (trail->magic != IDXMAGIC) { |
|
369 + Trace(("%s: invalid pigz index magic", path)); |
|
370 + return -1; |
|
371 + } |
|
372 + return 0; |
|
373 +} |
|
374 + |
|
375 +/* Expand a path pattern containing %f and/or %z tokens into a full path. |
|
376 + * Result is stored in idx.path. */ |
|
377 +local int expand_pathpat(char *pathpat) |
|
378 +{ |
|
379 + char *copy = NULL; /* points to in or out global */ |
|
380 + char *suf = NULL; /* suffix (.zz, .gz, etc.) */ |
|
381 + int chop_suffix; |
|
382 + int len; |
|
383 + int i; |
|
384 + int j; |
|
385 + int nag; |
|
386 + |
|
387 + /* Be quiet when opportunistic index use check is being done. */ |
|
388 + nag = ((index == NULL) && strcmp(pathpat, "%z")); |
|
389 + |
|
390 + for (i = 0, j = 0; pathpat[i] && j < sizeof(idx.path); i++) { |
|
391 + if (pathpat[i] != '%') { |
|
392 + idx.path[j++] = pathpat[i]; |
|
393 + continue; |
|
394 + } |
|
395 + i++; |
|
396 + switch (pathpat[i]) { |
|
397 + case '%': /* %% is replaced by % */ |
|
398 + idx.path[j++] = '%'; |
|
399 + continue; |
|
400 + case 'f': /* %f is replaced by uncompressed file name */ |
|
401 + if (decode) { |
|
402 + if (strcmp(out, "<stdout>") != 0) { |
|
403 + copy = out; /* uncompressed file */ |
|
404 + chop_suffix = 0; |
|
405 + break; |
|
406 + } |
|
407 + if (strcmp(in, "<stdin>") != 0) { |
|
408 + copy = in; /* compressed file */ |
|
409 + chop_suffix = 1; |
|
410 + suf = strrchr(in, '.'); |
|
411 + break; |
|
412 + } |
|
413 + if (nag) |
|
414 + complain("file name for %%f unknown"); |
|
415 + return -1; |
|
416 + } |
|
417 + |
|
418 + if (strcmp(out, "<stdout>") != 0) { |
|
419 + copy = out; /* compressed file */ |
|
420 + chop_suffix = 1; |
|
421 + suf = strrchr(out, '.'); |
|
422 + break; |
|
423 + } |
|
424 + if (strcmp(in, "<stdin>") != 0) { |
|
425 + copy = in; /* uncompressed file */ |
|
426 + chop_suffix = 0; |
|
427 + break; |
|
428 + } |
|
429 + if (nag) |
|
430 + complain("file name for %%f unknown"); |
|
431 + return -1; |
|
432 + case 'z': /* %z is replaced by compressed file name */ |
|
433 + chop_suffix = 0; |
|
434 + if (decode) { |
|
435 + if (strcmp(in, "<stdin>") == 0) { |
|
436 + if (nag) |
|
437 + complain("file name for %%z unknown"); |
|
438 + return -1; |
|
439 + } |
|
440 + copy = in; |
|
441 + break; |
|
442 + } |
|
443 + if (strcmp(pathpat, "%z") == 0) { |
|
444 + /* index will be appended onto stdout */ |
|
445 + copy = NULL; |
|
446 + idx.append = 1; |
|
447 + break; |
|
448 + } |
|
449 + if (strcmp(out, "<stdout>") == 0) { |
|
450 + if (nag) |
|
451 + complain("file name for %%z unknown"); |
|
452 + return -1; |
|
453 + } |
|
454 + copy = out; |
|
455 + break; |
|
456 + default: |
|
457 + if (nag) { |
|
458 + complain("invalid %% sequence in index file pattern %s", |
|
459 + pathpat); |
|
460 + } |
|
461 + return -1; |
|
462 + } |
|
463 + |
|
464 + /* pathpat is "%z" and out is stdout */ |
|
465 + if (copy == NULL) |
|
466 + break; |
|
467 + |
|
468 + len = strlen(&idx.path[j]) + strlen(copy); |
|
469 + if (chop_suffix) |
|
470 + len -= strlen(suf); |
|
471 + if (len >= (sizeof(idx.path) - j)) { |
|
472 + if (nag) |
|
473 + complain("index file name too long"); |
|
474 + return -1; |
|
475 + } |
|
476 + (void)strncpy(&idx.path[j], copy, sizeof(idx.path) - j); |
|
477 + j += len; |
|
478 + assert(j <= sizeof(idx.path)); |
|
479 + } |
|
480 + if (j == sizeof(idx.path)) { |
|
481 + idx.path[j-1] = '\0'; |
|
482 + if (nag) |
|
483 + complain("index file \"%s...\" name too long", idx.path); |
|
484 + return -1; |
|
485 + } |
|
486 + idx.path[j] = '\0'; |
|
487 + |
|
488 + if (copy == NULL && idx.append) { |
|
489 + (void)strncpy(idx.path, out, sizeof(idx.path)); |
|
490 + idx.path[sizeof(idx.path) - 1] = '\0'; |
|
491 + } |
|
492 + else { |
|
493 + if (same_file(decode ? out : in, idx.path)) { |
|
494 + if (nag) |
|
495 + complain("index file %s must not be same as uncompressed file", |
|
496 + idx.path); |
|
497 + return -1; |
|
498 + } |
|
499 + |
|
500 + idx.append = same_file(decode ? in : out, idx.path); |
|
501 + } |
|
502 + |
|
503 + if (verbosity > 1) |
|
504 + (void) fprintf(stderr, "index %s ", idx.path); |
|
505 + |
|
506 + return 0; |
|
507 +} |
|
508 + |
|
509 +/* open the index file associated with the current input or output file. */ |
|
510 +local int idx_open(char *pathpat) |
|
511 +{ |
|
512 + int ret; |
|
513 + struct stat st; |
|
514 + |
|
515 + assert(pathpat != NULL); |
|
516 + |
|
517 + memset(&idx, 0, sizeof(idx)); |
|
518 + |
|
519 + setup_jobs(); |
|
520 + |
|
521 + idxd = -1; |
|
522 + |
|
523 + if (expand_pathpat(pathpat) != 0) |
|
524 + return -1; |
|
525 + |
|
526 + if (decode) { /* Uncompress */ |
|
527 + int64_t sz; |
|
528 + int64_t off; |
|
529 + long pagesize; |
|
530 + |
|
531 + /* Position idxd at the first index record to read. */ |
|
532 + if (idx.append) { |
|
533 + struct idx_trailer trail; |
|
534 + |
|
535 + /* uncompressing, index at end of compressed file */ |
|
536 + if (idx_read_trailer(ind, in, &trail) != 0) { |
|
537 + complain("%s: could not read index", in); |
|
538 + return -1; |
|
539 + } |
|
540 + |
|
541 + idxd = dup(ind); |
|
542 + if (fstat(idxd, &st) != 0 || !S_ISREG(st.st_mode)) { |
|
543 + complain("%s: index appended to non-regular file", idx.path); |
|
544 + (void) close(idxd); |
|
545 + return -1; |
|
546 + } |
|
547 + off = st.st_size - sizeof(trail); |
|
548 + sz = trail.count * sizeof(struct idx_entry); |
|
549 + off -= sz; /* offset into file of first idx_entry */ |
|
550 + } else { |
|
551 + /* Uncompressing, index in a different file. */ |
|
552 + if ((idxd = open(idx.path, O_RDONLY)) < 0) { |
|
553 + complain("%s: unable to open index file", idx.path); |
|
554 + return -1; |
|
555 + } |
|
556 + if (fstat(idxd, &st) != 0) { |
|
557 + complain("%s: unable to stat index file", idx.path); |
|
558 + (void) close(idxd); |
|
559 + return -1; |
|
560 + } |
|
561 + off = 0; |
|
562 + } |
|
563 + /* Try to mmap the index file and let the OS manage the space used by |
|
564 + the index entries. The starting offset of must be a multiple of the |
|
565 + page size. The mapping will end at the end of the file. */ |
|
566 + if ((pagesize = sysconf(_SC_PAGESIZE)) > 0) { |
|
567 + off_t moff; /* mmap offset in idxd */ |
|
568 + |
|
569 + /* moff is the beginning of the page containing off */ |
|
570 + moff = off & ~(pagesize -1); |
|
571 + idx.mapsz = st.st_size - moff; |
|
572 + idx.map = mmap(NULL, idx.mapsz, PROT_READ, MAP_PRIVATE, idxd, moff); |
|
573 + if (idx.map != MAP_FAILED) { |
|
574 + (void)close(idxd); |
|
575 + idxd = -1; |
|
576 + |
|
577 + /* set up array for idx_get() */ |
|
578 + idx.ents = (struct idx_entry*)(idx.map + (off & (pagesize -1))); |
|
579 + |
|
580 + idx.valid = 1; |
|
581 + return 0; |
|
582 + } |
|
583 + idx.mapsz = 0; |
|
584 + idx.map = NULL; |
|
585 + } |
|
586 + /* unable to mmap. Ensure idxfd is positioned properly. */ |
|
587 + if (lseek(idxd, off, SEEK_SET) != off) { |
|
588 + complain("%s: unable to seek on index file", idx.path); |
|
589 + return -1; |
|
590 + } |
|
591 + idx.valid = 1; |
|
592 + return 0; |
|
593 + } |
|
594 + |
|
595 + /* compress - entries will be added to idx.space or idxd. */ |
|
596 + if (idx.append) { |
|
597 + idx.space = get_space(&idx_pool); |
|
598 + idx.valid = 1; |
|
599 + return 0; |
|
600 + } |
|
601 + |
|
602 + idxd = open(idx.path, O_WRONLY | O_CREAT | O_TRUNC | (force ? 0 : O_EXCL), |
|
603 + 0600); |
|
604 + if (idxd < 0 && errno == EEXIST && isatty(0) && verbosity && |
|
605 + allow_overwrite(idx.path)) { |
|
606 + idxd = open(idx.path, O_CREAT | O_TRUNC | O_WRONLY, 0600); |
|
607 + if (idxd == -1) { |
|
608 + complain("%s: %s", idx.path, strerror(errno)); |
|
609 + return -1; |
|
610 + } |
|
611 + } |
|
612 + idx.valid = 1; |
|
613 + return 0; |
|
614 +} |
|
615 + |
|
616 +local void idx_get_next(struct idx_entry *entry) |
|
617 +{ |
|
618 + uchar_t buf[sizeof(*entry)]; |
|
619 + uchar_t *base; |
|
620 + |
|
621 + if (idx.ents != NULL) |
|
622 + base = (uchar_t *)&idx.ents[idx.seq]; |
|
623 + else { |
|
624 + readn(idxd, buf, sizeof(buf)); |
|
625 + base = buf; |
|
626 + } |
|
627 + entry->infsz = PULL4L(base); |
|
628 + entry->defsz = PULL4L(base + 4); |
|
629 + entry->check = PULL4L(base + 8); |
|
630 +} |
|
631 + |
|
632 +/* Returns the fields of the next index entry. */ |
|
633 +local void idx_get(uint64_t *inflated, uint64_t *deflated, uint64_t *check, |
|
634 + int *last) |
|
635 +{ |
|
636 + struct idx_trailer *t; |
|
637 + static struct idx_entry entry; /* value from previous call */ |
|
638 + |
|
639 + assert(!idx.eof); |
|
640 + |
|
641 + if (idx.seq == 0) |
|
642 + idx_get_next(&entry); |
|
643 + |
|
644 + *inflated = entry.infsz; |
|
645 + *deflated = entry.defsz; |
|
646 + *check = entry.check; |
|
647 + idx.seq++; |
|
648 + |
|
649 + /* Look for trailer after this. Value retained for next call. */ |
|
650 + idx_get_next(&entry); |
|
651 + |
|
652 + t = (struct idx_trailer *)&entry; |
|
653 + *last = (t->magic == IDXMAGIC); |
|
654 + idx.eof = *last; |
|
655 +} |
|
656 + |
|
657 +local void idx_add(size_t insz, size_t outsz, unsigned long check) |
|
658 +{ |
|
659 + uchar_t buf[sizeof(struct idx_entry)]; |
|
660 + uchar_t *start; |
|
661 + |
|
662 + idx.seq++; |
|
663 + |
|
664 + /* point start at the right buffer, ensuring it is big enough */ |
|
665 + if (idxd != -1) { |
|
666 + start = buf; |
|
667 + } else { |
|
668 + possess(idx.space->use); |
|
669 + while (idx.space->size - idx.space->len < sizeof(struct idx_entry)) |
|
670 + grow_space(idx.space); |
|
671 + start = idx.space->buf + idx.space->len; |
|
672 + } |
|
673 + |
|
674 + /* copy data into buffer */ |
|
675 + PUT4L(start, (uint32_t)insz); |
|
676 + PUT4L(start + 4, (uint32_t)outsz); |
|
677 + PUT4L(start + 8, (uint32_t)check); |
|
678 + |
|
679 + if (idxd != -1) |
|
680 + writen(idxd, buf, sizeof(buf)); |
|
681 + else { |
|
682 + idx.space->len += sizeof(struct idx_entry); |
|
683 + release(idx.space->use); |
|
684 + } |
|
685 +} |
|
686 + |
|
687 +local void idx_close(void) |
|
688 +{ |
|
689 + uchar_t buf[sizeof(struct idx_trailer)]; |
|
690 + |
|
691 + assert(idx.valid); |
|
692 + idx.valid = 0; |
|
693 + |
|
694 + if (decode && !keep && !idx.append) |
|
695 + (void)unlink(idx.path); |
|
696 + |
|
697 + if (idx.map != NULL) { /* uncompressing, using mmap'd index */ |
|
698 + (void)munmap(idx.map, idx.mapsz); |
|
699 + idx.ents = NULL; |
|
700 + return; |
|
701 + } |
|
702 + |
|
703 + if (decode) { /* uncompressing, from a file */ |
|
704 + (void)close(idxd); |
|
705 + idxd = -1; |
|
706 + return; |
|
707 + } |
|
708 + |
|
709 + if (idx.space != NULL) { /* compressing, append to output file */ |
|
710 + writen(outd, idx.space->buf, idx.space->len); |
|
711 + release(idx.space->use); |
|
712 + drop_space(idx.space); |
|
713 + } |
|
714 + |
|
715 + PUT4L(buf, IDXMAGIC); |
|
716 + PUT8L(buf + 4, idx.seq); |
|
717 + |
|
718 + writen(idx.append ? outd : idxd, buf, sizeof(buf)); |
|
719 + |
|
720 + if (idxd != -1) { |
|
721 + (void) close(idxd); |
|
722 + idxd = -1; |
|
723 + } |
|
724 +} |
|
725 + |
|
726 +/* Does the compressed input file have an index appended? */ |
|
727 +local int ind_has_index(void) |
|
728 +{ |
|
729 + struct idx_trailer trail; |
|
730 + |
|
731 + /* Not relevant unless we are uncompressing */ |
|
732 + if (decode == 0) |
|
733 + return (0); |
|
734 + |
|
735 + return (idx_read_trailer(ind, in, &trail) == 0); |
|
736 +} |
|
737 + |
|
738 /* collect the write jobs off of the list in sequence order and write out the |
|
739 compressed data until the last chunk is written -- also write the header and |
|
740 trailer and combine the individual check values of the input buffers */ |
|
741 -local void write_thread(void *dummy) |
|
742 +local void compress_write_thread(void *dummy) |
|
743 { |
|
744 long seq; /* next sequence number looking for */ |
|
745 struct job *job; /* job pulled and working on */ |
|
746 size_t len; /* input length */ |
|
747 + size_t olen; /* output length */ |
|
748 int more; /* true if more chunks to write */ |
|
749 unsigned long head; /* header length */ |
|
750 unsigned long ulen; /* total uncompressed size (overflow ok) */ |
|
751 - unsigned long clen; /* total compressed size (overflow ok) */ |
|
752 + size_t clen; /* total compressed size */ |
|
753 unsigned long check; /* check value of uncompressed data */ |
|
754 |
|
755 (void)dummy; |
|
756 @@ -1430,23 +1965,27 @@ |
|
757 /* update lengths, save uncompressed length for COMB */ |
|
758 more = job->more; |
|
759 len = job->in->len; |
|
760 + olen = job->out->len; |
|
761 drop_space(job->in); |
|
762 ulen += (unsigned long)len; |
|
763 - clen += (unsigned long)(job->out->len); |
|
764 + clen += olen; |
|
765 |
|
766 /* write the compressed data and drop the output buffer */ |
|
767 Trace(("-- writing #%ld", seq)); |
|
768 - writen(outd, job->out->buf, job->out->len); |
|
769 + writen(outd, job->out->buf, olen); |
|
770 drop_space(job->out); |
|
771 Trace(("-- wrote #%ld%s", seq, more ? "" : " (last)")); |
|
772 |
|
773 - /* wait for check calculation to complete, then combine, once |
|
774 - the compress thread is done with the input, release it */ |
|
775 + /* wait for check calculation to complete, then combine */ |
|
776 possess(job->calc); |
|
777 wait_for(job->calc, TO_BE, 1); |
|
778 release(job->calc); |
|
779 check = COMB(check, job->check, len); |
|
780 |
|
781 + /* update the block index */ |
|
782 + if (index) |
|
783 + idx_add(len, olen, job->check); |
|
784 + |
|
785 /* free the job */ |
|
786 free_lock(job->calc); |
|
787 free(job); |
|
788 @@ -1517,7 +2056,7 @@ |
|
789 setup_jobs(); |
|
790 |
|
791 /* start write thread */ |
|
792 - writeth = launch(write_thread, NULL); |
|
793 + writeth = launch(compress_write_thread, NULL); |
|
794 |
|
795 /* read from input and start compress threads (write thread will pick up |
|
796 the output of the compress threads) */ |
|
797 @@ -1913,7 +2452,7 @@ |
|
798 #ifndef NOTHREAD |
|
799 /* if first time in or procs == 1, read a buffer to have something to |
|
800 return, otherwise wait for the previous read job to complete */ |
|
801 - if (procs > 1) { |
|
802 + if (procs > 1 && index == NULL && !ind_has_index()) { |
|
803 /* if first time, fire up the read thread, ask for a read */ |
|
804 if (in_which == -1) { |
|
805 in_which = 1; |
|
806 @@ -1995,12 +2534,6 @@ |
|
807 in_next += togo; \ |
|
808 } while (0) |
|
809 |
|
810 -/* pull LSB order or MSB order integers from an unsigned char buffer */ |
|
811 -#define PULL2L(p) ((p)[0] + ((unsigned)((p)[1]) << 8)) |
|
812 -#define PULL4L(p) (PULL2L(p) + ((unsigned long)(PULL2L((p) + 2)) << 16)) |
|
813 -#define PULL2M(p) (((unsigned)((p)[0]) << 8) + (p)[1]) |
|
814 -#define PULL4M(p) (((unsigned long)(PULL2M(p)) << 16) + PULL2M((p) + 2)) |
|
815 - |
|
816 /* convert MS-DOS date and time to a Unix time, assuming current timezone |
|
817 (you got a better idea?) */ |
|
818 local time_t dos2time(unsigned long dos) |
|
819 @@ -2613,6 +3146,73 @@ |
|
820 return 0; |
|
821 } |
|
822 |
|
823 +local void check_trailer(unsigned long check, off_t clen) |
|
824 +{ |
|
825 + unsigned tmp2; /* used by GET4() */ |
|
826 + unsigned long tmp4; /* used by GET4() */ |
|
827 + unsigned long len; |
|
828 + |
|
829 + /* read and check trailer */ |
|
830 + if (form > 1) { /* zip local trailer (if any) */ |
|
831 + if (form == 3) { /* data descriptor follows */ |
|
832 + /* read original version of data descriptor */ |
|
833 + zip_crc = GET4(); |
|
834 + zip_clen = GET4(); |
|
835 + zip_ulen = GET4(); |
|
836 + if (in_eof) |
|
837 + bail("corrupted zip entry -- missing trailer: ", in); |
|
838 + |
|
839 + /* if crc doesn't match, try info-zip variant with sig */ |
|
840 + if (zip_crc != out_check) { |
|
841 + if (zip_crc != 0x08074b50UL || zip_clen != out_check) |
|
842 + bail("corrupted zip entry -- crc32 mismatch: ", in); |
|
843 + zip_crc = zip_clen; |
|
844 + zip_clen = zip_ulen; |
|
845 + zip_ulen = GET4(); |
|
846 + } |
|
847 + |
|
848 + /* handle incredibly rare cases where crc equals signature */ |
|
849 + else if (zip_crc == 0x08074b50UL && zip_clen == zip_crc && |
|
850 + ((clen & LOW32) != zip_crc || zip_ulen == zip_crc)) { |
|
851 + zip_crc = zip_clen; |
|
852 + zip_clen = zip_ulen; |
|
853 + zip_ulen = GET4(); |
|
854 + } |
|
855 + |
|
856 + /* if second length doesn't match, try 64-bit lengths */ |
|
857 + if (zip_ulen != (out_tot & LOW32)) { |
|
858 + zip_ulen = GET4(); |
|
859 + (void)GET4(); |
|
860 + } |
|
861 + if (in_eof) |
|
862 + bail("corrupted zip entry -- missing trailer: ", in); |
|
863 + } |
|
864 + if (zip_clen != (clen & LOW32) || zip_ulen != (out_tot & LOW32)) |
|
865 + bail("corrupted zip entry -- length mismatch: ", in); |
|
866 + check = zip_crc; |
|
867 + } |
|
868 + else if (form == 1) { /* zlib (big-endian) trailer */ |
|
869 + check = (unsigned long)(GET()) << 24; |
|
870 + check += (unsigned long)(GET()) << 16; |
|
871 + check += (unsigned)(GET()) << 8; |
|
872 + check += GET(); |
|
873 + if (in_eof) |
|
874 + bail("corrupted zlib stream -- missing trailer: ", in); |
|
875 + if (check != out_check) |
|
876 + bail("corrupted zlib stream -- adler32 mismatch: ", in); |
|
877 + } |
|
878 + else { /* gzip trailer */ |
|
879 + check = GET4(); |
|
880 + len = GET4(); |
|
881 + if (in_eof) |
|
882 + bail("corrupted gzip stream -- missing trailer: ", in); |
|
883 + if (check != out_check) |
|
884 + bail("corrupted gzip stream -- crc32 mismatch: ", in); |
|
885 + if (len != (out_tot & LOW32)) |
|
886 + bail("corrupted gzip stream -- length mismatch: ", in); |
|
887 + } |
|
888 +} |
|
889 + |
|
890 /* inflate for decompression or testing -- decompress from ind to outd unless |
|
891 decode != 1, in which case just test ind, and then also list if list != 0; |
|
892 look for and decode multiple, concatenated gzip and/or zlib streams; |
|
893 @@ -2620,10 +3220,8 @@ |
|
894 local void infchk(void) |
|
895 { |
|
896 int ret, cont; |
|
897 - unsigned long check, len; |
|
898 + unsigned long check; |
|
899 z_stream strm; |
|
900 - unsigned tmp2; |
|
901 - unsigned long tmp4; |
|
902 off_t clen; |
|
903 |
|
904 cont = 0; |
|
905 @@ -2653,65 +3251,7 @@ |
|
906 /* compute compressed data length */ |
|
907 clen = in_tot - in_left; |
|
908 |
|
909 - /* read and check trailer */ |
|
910 - if (form > 1) { /* zip local trailer (if any) */ |
|
911 - if (form == 3) { /* data descriptor follows */ |
|
912 - /* read original version of data descriptor */ |
|
913 - zip_crc = GET4(); |
|
914 - zip_clen = GET4(); |
|
915 - zip_ulen = GET4(); |
|
916 - if (in_eof) |
|
917 - bail("corrupted zip entry -- missing trailer: ", in); |
|
918 - |
|
919 - /* if crc doesn't match, try info-zip variant with sig */ |
|
920 - if (zip_crc != out_check) { |
|
921 - if (zip_crc != 0x08074b50UL || zip_clen != out_check) |
|
922 - bail("corrupted zip entry -- crc32 mismatch: ", in); |
|
923 - zip_crc = zip_clen; |
|
924 - zip_clen = zip_ulen; |
|
925 - zip_ulen = GET4(); |
|
926 - } |
|
927 - |
|
928 - /* handle incredibly rare cases where crc equals signature */ |
|
929 - else if (zip_crc == 0x08074b50UL && zip_clen == zip_crc && |
|
930 - ((clen & LOW32) != zip_crc || zip_ulen == zip_crc)) { |
|
931 - zip_crc = zip_clen; |
|
932 - zip_clen = zip_ulen; |
|
933 - zip_ulen = GET4(); |
|
934 - } |
|
935 - |
|
936 - /* if second length doesn't match, try 64-bit lengths */ |
|
937 - if (zip_ulen != (out_tot & LOW32)) { |
|
938 - zip_ulen = GET4(); |
|
939 - (void)GET4(); |
|
940 - } |
|
941 - if (in_eof) |
|
942 - bail("corrupted zip entry -- missing trailer: ", in); |
|
943 - } |
|
944 - if (zip_clen != (clen & LOW32) || zip_ulen != (out_tot & LOW32)) |
|
945 - bail("corrupted zip entry -- length mismatch: ", in); |
|
946 - check = zip_crc; |
|
947 - } |
|
948 - else if (form == 1) { /* zlib (big-endian) trailer */ |
|
949 - check = (unsigned long)(GET()) << 24; |
|
950 - check += (unsigned long)(GET()) << 16; |
|
951 - check += (unsigned)(GET()) << 8; |
|
952 - check += GET(); |
|
953 - if (in_eof) |
|
954 - bail("corrupted zlib stream -- missing trailer: ", in); |
|
955 - if (check != out_check) |
|
956 - bail("corrupted zlib stream -- adler32 mismatch: ", in); |
|
957 - } |
|
958 - else { /* gzip trailer */ |
|
959 - check = GET4(); |
|
960 - len = GET4(); |
|
961 - if (in_eof) |
|
962 - bail("corrupted gzip stream -- missing trailer: ", in); |
|
963 - if (check != out_check) |
|
964 - bail("corrupted gzip stream -- crc32 mismatch: ", in); |
|
965 - if (len != (out_tot & LOW32)) |
|
966 - bail("corrupted gzip stream -- length mismatch: ", in); |
|
967 - } |
|
968 + check_trailer(check, clen); |
|
969 |
|
970 /* show file information if requested */ |
|
971 if (list) { |
|
972 @@ -2731,6 +3271,231 @@ |
|
973 complain("%s OK, has trailing junk which was ignored", in); |
|
974 } |
|
975 |
|
976 +local void uncompress_write_thread(void *dummy) |
|
977 +{ |
|
978 + long seq; /* next sequence number looking for */ |
|
979 + struct job *job; /* job pulled and working on */ |
|
980 + int more; /* true if more chunks to write */ |
|
981 + |
|
982 + (void)dummy; |
|
983 + |
|
984 + seq = 0; |
|
985 + do { |
|
986 + /* get next write job in order */ |
|
987 + possess(write_first); |
|
988 + wait_for(write_first, TO_BE, seq); |
|
989 + job = write_head; |
|
990 + write_head = job->next; |
|
991 + twist(write_first, TO, write_head == NULL ? -1 : write_head->seq); |
|
992 + |
|
993 + /* Checksum has been verified. Accumulate the checksum, write the |
|
994 + output, and free the input and output spaces. While the input space |
|
995 + could be dropped earlier, it is done here to ensure the write queue |
|
996 + doesn't grow without bounds. */ |
|
997 + out_check = COMB(out_check, job->check, job->out->len); |
|
998 + out_tot += job->out->len; |
|
999 + |
|
1000 + Trace(("-- writing #%ld", seq)); |
|
1001 + if (decode == 1) /* don't really write if just checking */ |
|
1002 + writen(outd, job->out->buf, job->out->len); |
|
1003 + drop_space(job->in); |
|
1004 + drop_space(job->out); |
|
1005 + Trace(("-- wrote #%ld%s", seq, job->more ? "" : " (last)")); |
|
1006 + |
|
1007 + more = job->more; |
|
1008 + free(job); |
|
1009 + |
|
1010 + seq++; |
|
1011 + } while (more); |
|
1012 + |
|
1013 + /* verify no more jobs, prepare for next use */ |
|
1014 + possess(compress_have); |
|
1015 + assert(compress_head == NULL && peek_lock(compress_have) == 0); |
|
1016 + release(compress_have); |
|
1017 + possess(write_first); |
|
1018 + assert(write_head == NULL); |
|
1019 + twist(write_first, TO, -1); |
|
1020 +} |
|
1021 + |
|
1022 +local void uncompress_thread(void *dummy) |
|
1023 +{ |
|
1024 + struct job *job; /* job pulled and working on */ |
|
1025 + struct job *here, **prior; /* pointers for inserting in write list */ |
|
1026 + unsigned long check; /* check value of output */ |
|
1027 + z_stream strm; /* deflate stream */ |
|
1028 + int err; /* error from inflate() */ |
|
1029 + long firstcheck; /* the initial checksum value */ |
|
1030 + |
|
1031 + (void)dummy; |
|
1032 + |
|
1033 + strm.zfree = Z_NULL; |
|
1034 + strm.zalloc = Z_NULL; |
|
1035 + strm.opaque = Z_NULL; |
|
1036 + if (inflateInit2(&strm, -15) != Z_OK) |
|
1037 + bail("not enough memory", ""); |
|
1038 + |
|
1039 + firstcheck = CHECK(0, Z_NULL, 0); |
|
1040 + |
|
1041 + /* keep looking for work */ |
|
1042 + for (;;) { |
|
1043 + possess(compress_have); |
|
1044 + wait_for(compress_have, NOT_TO_BE, 0); |
|
1045 + job = compress_head; |
|
1046 + assert(job != NULL); |
|
1047 + if (job->seq == -1) |
|
1048 + break; |
|
1049 + compress_head = job->next; |
|
1050 + if (job->next == NULL) |
|
1051 + compress_tail = &compress_head; |
|
1052 + twist(compress_have, BY, -1); |
|
1053 + |
|
1054 + /* got a job -- buffers have all been allocated to the right size. |
|
1055 + deflate and verify the checksum. */ |
|
1056 + Trace(("-- uncompressing #%ld", job->seq)); |
|
1057 + if (inflateReset2(&strm, -15) != Z_OK) |
|
1058 + bail("stream reset failed: ", strm.msg); |
|
1059 + strm.next_in = job->in->buf; |
|
1060 + strm.avail_in = job->in->len; |
|
1061 + strm.next_out = job->out->buf; |
|
1062 + strm.avail_out = job->out->len; |
|
1063 + |
|
1064 + err = inflate(&strm, Z_SYNC_FLUSH); |
|
1065 + if (err != Z_OK && err != Z_STREAM_END) |
|
1066 + bail("corrupted input -- invalid deflate data: ", strm.msg); |
|
1067 + |
|
1068 + /* It's not strictly necessary to verify the checksum here, but it |
|
1069 + seems nice to get an error about a bad checksum as early as possible |
|
1070 + to wasteful cpu and i/o consumtion. */ |
|
1071 + check = CHECK(firstcheck, job->out->buf, job->out->len); |
|
1072 + if (check != job->check) { |
|
1073 + if (form == 1) |
|
1074 + bail("corrupted zlib stream -- adler32 mismatch: ", in); |
|
1075 + else |
|
1076 + bail("corrupted gzip stream -- crc32 mismatch: ", in); |
|
1077 + } |
|
1078 + |
|
1079 + Trace(("-- uncompressed #%ld%s", job->seq, job->more ? "" : " (last)")); |
|
1080 + |
|
1081 + /* insert write job in list in sorted order, alert write thread */ |
|
1082 + possess(write_first); |
|
1083 + prior = &write_head; |
|
1084 + while ((here = *prior) != NULL) { |
|
1085 + if (here->seq > job->seq) |
|
1086 + break; |
|
1087 + prior = &(here->next); |
|
1088 + } |
|
1089 + job->next = here; |
|
1090 + *prior = job; |
|
1091 + twist(write_first, TO, write_head->seq); |
|
1092 + } |
|
1093 + /* found job with seq == -1 -- free deflate memory and return to join */ |
|
1094 + release(compress_have); |
|
1095 + (void)deflateEnd(&strm); |
|
1096 +} |
|
1097 + |
|
1098 +local void parallel_infchk(void) |
|
1099 +{ |
|
1100 + long seq; /* sequence number */ |
|
1101 + struct job *job; /* job of uncompress, then write */ |
|
1102 + struct space *insp; /* space for job input */ |
|
1103 + struct space *outsp; /* space for job output */ |
|
1104 + size_t fromload; |
|
1105 + uint64_t infsz; /* size after inflate() */ |
|
1106 + uint64_t defsz; /* size before inflate() */ |
|
1107 + uint64_t check; /* checksum */ |
|
1108 + int last = 0; /* is this the last block? */ |
|
1109 + |
|
1110 + /* If the index is useless, don't try to use it. */ |
|
1111 + if (!idx.valid) { |
|
1112 + infchk(); |
|
1113 + return; |
|
1114 + } |
|
1115 + |
|
1116 + if (form > 1) { |
|
1117 + complain("index not supported with zip file ", in); |
|
1118 + infchk(); |
|
1119 + return; |
|
1120 + } |
|
1121 + |
|
1122 + /* if first time or after an option change, setup the job lists */ |
|
1123 + setup_jobs(); |
|
1124 + |
|
1125 + /* start write thread */ |
|
1126 + writeth = launch(uncompress_write_thread, NULL); |
|
1127 + |
|
1128 + /* updated by uncompress_write_thread */ |
|
1129 + out_check = CHECK(0L, Z_NULL, 0); |
|
1130 + out_len = 0; |
|
1131 + out_tot = 0; |
|
1132 + |
|
1133 + for (seq = 0; !last; seq++) { |
|
1134 + /* get the next entry from the index */ |
|
1135 + idx_get(&infsz, &defsz, &check, &last); |
|
1136 + |
|
1137 + job = malloc(sizeof(struct job)); |
|
1138 + if (job == NULL) |
|
1139 + bail("not enough memory", ""); |
|
1140 + job->seq = seq; |
|
1141 + job->more = !last; |
|
1142 + job->in = get_space_size(&in_pool, defsz); |
|
1143 + job->out = get_space_size(&out_pool, infsz); |
|
1144 + job->lens = NULL; |
|
1145 + job->check = check; |
|
1146 + job->calc = NULL; |
|
1147 + job->next = NULL; |
|
1148 + |
|
1149 + /* reading the header cached some data, be sure not to skip it */ |
|
1150 + fromload = (in_left < defsz ? in_left : defsz); |
|
1151 + if (fromload > 0) { |
|
1152 + (void)memcpy(job->in->buf, in_next, fromload); |
|
1153 + in_left -= fromload; |
|
1154 + in_next += fromload; |
|
1155 + } |
|
1156 + if (fromload < defsz) |
|
1157 + readn(ind, job->in->buf + fromload, defsz - fromload); |
|
1158 + job->in->len = defsz; |
|
1159 + job->out->len = infsz; |
|
1160 + |
|
1161 + out_len += infsz; |
|
1162 + |
|
1163 + /* start another uncompress thread if needed */ |
|
1164 + if (cthreads <= seq && cthreads < procs) { |
|
1165 + (void)launch(uncompress_thread, NULL); |
|
1166 + cthreads++; |
|
1167 + } |
|
1168 + |
|
1169 + possess(compress_have); |
|
1170 + *compress_tail = job; |
|
1171 + compress_tail = &(job->next); |
|
1172 + twist(compress_have, BY, +1); |
|
1173 + } |
|
1174 + |
|
1175 + /* wait for the write thread to complete (we leave the compress threads out |
|
1176 + there and waiting in case there is another stream to compress) */ |
|
1177 + join(writeth); |
|
1178 + writeth = NULL; |
|
1179 + Trace(("-- write thread joined")); |
|
1180 + |
|
1181 + check_trailer(out_check, out_len); |
|
1182 +} |
|
1183 + |
|
1184 +/* parallel_infchk() or infchk(), whichever works. */ |
|
1185 +local void best_infchk(void) |
|
1186 +{ |
|
1187 + if (index != NULL) { |
|
1188 + /* User specified index file */ |
|
1189 + if (idx_open(index) != 0) |
|
1190 + bail("invalid index file", ""); |
|
1191 + } |
|
1192 + else if (ind_has_index()) |
|
1193 + (void)idx_open("%z"); |
|
1194 + |
|
1195 + if (idx.valid) |
|
1196 + parallel_infchk(); |
|
1197 + else |
|
1198 + infchk(); |
|
1199 +} |
|
1200 + |
|
1201 /* --- decompress Unix compress (LZW) input --- */ |
|
1202 |
|
1203 /* memory for unlzw() -- |
|
1204 @@ -3159,7 +3924,7 @@ |
|
1205 /* if requested, test input file (possibly a special list) */ |
|
1206 if (decode == 2) { |
|
1207 if (method == 8) |
|
1208 - infchk(); |
|
1209 + best_infchk(); |
|
1210 else { |
|
1211 unlzw(); |
|
1212 if (list) { |
|
1213 @@ -3219,19 +3984,8 @@ |
|
1214 |
|
1215 /* if exists and not -f, give user a chance to overwrite */ |
|
1216 if (outd < 0 && errno == EEXIST && isatty(0) && verbosity) { |
|
1217 - int ch, reply; |
|
1218 - |
|
1219 - fprintf(stderr, "%s exists -- overwrite (y/n)? ", out); |
|
1220 - fflush(stderr); |
|
1221 - reply = -1; |
|
1222 - do { |
|
1223 - ch = getchar(); |
|
1224 - if (reply < 0 && ch != ' ' && ch != '\t') |
|
1225 - reply = ch == 'y' || ch == 'Y' ? 1 : 0; |
|
1226 - } while (ch != EOF && ch != '\n' && ch != '\r'); |
|
1227 - if (reply == 1) |
|
1228 - outd = open(out, O_CREAT | O_TRUNC | O_WRONLY, |
|
1229 - 0600); |
|
1230 + if (allow_overwrite(out)) |
|
1231 + outd = open(out, O_CREAT | O_TRUNC | O_WRONLY, 0600); |
|
1232 } |
|
1233 |
|
1234 /* if exists and no overwrite, report and go on to next */ |
|
1235 @@ -3254,17 +4008,22 @@ |
|
1236 /* process ind to outd */ |
|
1237 if (verbosity > 1) |
|
1238 fprintf(stderr, "%s to %s ", in, out); |
|
1239 + |
|
1240 if (decode) { |
|
1241 - if (method == 8) |
|
1242 - infchk(); |
|
1243 + if (method == 8) { |
|
1244 + best_infchk(); |
|
1245 + } |
|
1246 else if (method == 256) |
|
1247 unlzw(); |
|
1248 else |
|
1249 cat(); |
|
1250 } |
|
1251 #ifndef NOTHREAD |
|
1252 - else if (procs > 1) |
|
1253 + else if (procs > 1) { |
|
1254 + if (index != NULL && idx_open(index) != 0) |
|
1255 + bail("invalid index file", ""); |
|
1256 parallel_compress(); |
|
1257 + } |
|
1258 #endif |
|
1259 else |
|
1260 single_compress(0); |
|
1261 @@ -3273,6 +4032,10 @@ |
|
1262 fflush(stderr); |
|
1263 } |
|
1264 |
|
1265 + /* close index file - this may append the index to outd */ |
|
1266 + if (idx.valid) |
|
1267 + idx_close(); |
|
1268 + |
|
1269 /* finish up, copy attributes, set times, delete original */ |
|
1270 if (ind != 0) |
|
1271 close(ind); |
|
1272 @@ -3331,6 +4094,9 @@ |
|
1273 " -v, --verbose Provide more verbose output", |
|
1274 #endif |
|
1275 " -V --version Show the version of pigz", |
|
1276 +" -X --index file Create or use parallel uncompression index file.", |
|
1277 +" %f and %z are replaced by uncompressed and compressed", |
|
1278 +" file names", |
|
1279 " -z, --zlib Compress to zlib (.zz) instead of gzip format", |
|
1280 " -- All arguments after \"--\" are treated as files" |
|
1281 }; |
|
1282 @@ -3400,11 +4166,11 @@ |
|
1283 local char *longopts[][2] = { |
|
1284 {"LZW", "Z"}, {"ascii", "a"}, {"best", "9"}, {"bits", "Z"}, |
|
1285 {"blocksize", "b"}, {"decompress", "d"}, {"fast", "1"}, {"force", "f"}, |
|
1286 - {"help", "h"}, {"independent", "i"}, {"keep", "k"}, {"license", "L"}, |
|
1287 - {"list", "l"}, {"name", "N"}, {"no-name", "n"}, {"no-time", "T"}, |
|
1288 - {"processes", "p"}, {"quiet", "q"}, {"recursive", "r"}, {"rsyncable", "R"}, |
|
1289 - {"silent", "q"}, {"stdout", "c"}, {"suffix", "S"}, {"test", "t"}, |
|
1290 - {"to-stdout", "c"}, {"uncompress", "d"}, {"verbose", "v"}, |
|
1291 + {"help", "h"}, {"independent", "i"}, {"index", "X"}, {"keep", "k"}, |
|
1292 + {"license", "L"}, {"list", "l"}, {"name", "N"}, {"no-name", "n"}, |
|
1293 + {"no-time", "T"}, {"processes", "p"}, {"quiet", "q"}, {"recursive", "r"}, |
|
1294 + {"rsyncable", "R"}, {"silent", "q"}, {"stdout", "c"}, {"suffix", "S"}, |
|
1295 + {"test", "t"}, {"to-stdout", "c"}, {"uncompress", "d"}, {"verbose", "v"}, |
|
1296 {"version", "V"}, {"zip", "K"}, {"zlib", "z"}}; |
|
1297 #define NLOPTS (sizeof(longopts) / (sizeof(char *) << 1)) |
|
1298 |
|
1299 @@ -3444,7 +4210,7 @@ |
|
1300 |
|
1301 /* if no argument or dash option, check status of get */ |
|
1302 if (get && (arg == NULL || *arg == '-')) { |
|
1303 - bad[1] = "bpS"[get - 1]; |
|
1304 + bad[1] = "bpSX"[get - 1]; |
|
1305 bail("missing parameter after ", bad); |
|
1306 } |
|
1307 if (arg == NULL) |
|
1308 @@ -3503,6 +4269,7 @@ |
|
1309 case 'R': rsync = 1; break; |
|
1310 case 'S': get = 3; break; |
|
1311 case 'V': fputs(VERSION, stderr); exit(0); |
|
1312 + case 'X': setdict = 0; get = 4; break; |
|
1313 case 'Z': |
|
1314 bail("invalid option: LZW output not supported: ", bad); |
|
1315 case 'a': |
|
1316 @@ -3530,7 +4297,7 @@ |
|
1317 return 0; |
|
1318 } |
|
1319 |
|
1320 - /* process option parameter for -b, -p, or -S */ |
|
1321 + /* process option parameter for -b, -p, -S, or -X */ |
|
1322 if (get) { |
|
1323 size_t n; |
|
1324 |
|
1325 @@ -3543,7 +4310,7 @@ |
|
1326 OUTPOOL(size) < size || |
|
1327 (ssize_t)OUTPOOL(size) < 0 || |
|
1328 size > (1UL << 22)) |
|
1329 - bail("block size too large: ", arg); |
|
1330 + bail("block size too large:", arg); |
|
1331 new_opts(); |
|
1332 } |
|
1333 else if (get == 2) { |
|
1334 @@ -3561,6 +4328,9 @@ |
|
1335 } |
|
1336 else if (get == 3) |
|
1337 sufx = arg; /* gz suffix */ |
|
1338 + else if (get == 4) |
|
1339 + index = arg; /* index file */ |
|
1340 + |
|
1341 get = 0; |
|
1342 return 0; |
|
1343 } |