1 parallel uncompress - developed by Oracle |
|
2 Offered to upstream at https://github.com/mgerdts/pigz |
|
3 - Branch mt-uncompress-2.2 forked from https://github.com/madler/pigz v. 2.2.6 |
|
4 - Branch mt-uncompress forked from https://github.com/madler/pigz branch master |
|
5 |
|
6 The following generated with: |
|
7 |
|
8 git diff -W 8041c56eca89c427aa0a67f40e92675f3584b4bd \ |
|
9 be138d877c14c5a3f58c67939bf822d83e342947 |
|
10 |
|
11 diff --git a/Makefile b/Makefile |
|
12 index 822902c..0c904a6 100644 |
|
13 --- a/Makefile |
|
14 +++ b/Makefile |
|
15 @@ -44,6 +44,15 @@ test: pigz |
|
16 compress -f < pigz.c | ./unpigz | cmp - pigz.c ;\ |
|
17 fi |
|
18 @rm -f pigz.c.gz pigz.c.zz pigz.c.zip |
|
19 + @rm -rf d/1 d/2 |
|
20 + (mkdir -p d/1; cd d/1; tar xzf ../../../../pigz-2.2.5.tar.gz; \ |
|
21 + cd ..; cp -pr 1 2; ../pigz -rp 4 --index %z 1; \ |
|
22 + ../pigz -drp 4 --index %z 1; diff -r 1 2) |
|
23 + @rm -rf d/1 d/2 |
|
24 + (mkdir -p d/1; cd d/1; tar xzf ../../../../pigz-2.2.5.tar.gz; \ |
|
25 + cd ..; cp -pr 1 2; ../pigz -zrp 4 -X %f.idx 1; \ |
|
26 + ../pigz -dzrp 4 -X %f.idx 1; diff -r 1 2) |
|
27 + @rm -rf d/1 d/2 |
|
28 |
|
29 tests: dev test |
|
30 ./pigzn -kf pigz.c ; ./pigz -t pigz.c.gz |
|
31 diff --git a/pigz.1 b/pigz.1 |
|
32 index 8d75ca2..f47414f 100644 |
|
33 --- a/pigz.1 |
|
34 +++ b/pigz.1 |
|
35 @@ -180,6 +180,14 @@ Provide more verbose output. |
|
36 .B -V --version |
|
37 Show the version of pigz. |
|
38 .TP |
|
39 +.B -X --index file |
|
40 +During compression, create an index that can be used for parallel |
|
41 +decompression. During decompression, use the specified index file for parallel |
|
42 +decompression. Each occurrence of %f and %z are replaced by the uncompressed |
|
43 +and compressed file names, respectively. If the index file is the same file as |
|
44 +the compressed file, the index is written to or read from the end of the |
|
45 +compressed file. |
|
46 +.TP |
|
47 .B -z --zlib |
|
48 Compress to zlib (.zz) instead of gzip format. |
|
49 .TP |
|
50 diff --git a/pigz.c b/pigz.c |
|
51 index ebef63e..5a61315 100644 |
|
52 --- a/pigz.c |
|
53 +++ b/pigz.c |
|
54 @@ -192,13 +192,27 @@ |
|
55 effectiveness of deflating in a single thread. This can be turned off using |
|
56 the --independent or -i option, so that the blocks can be decompressed |
|
57 independently for partial error recovery or for random access. |
|
58 - |
|
59 - Decompression can't be parallelized, at least not without specially prepared |
|
60 - deflate streams for that purpose. As a result, pigz uses a single thread |
|
61 - (the main thread) for decompression, but will create three other threads for |
|
62 - reading, writing, and check calculation, which can speed up decompression |
|
63 - under some circumstances. Parallel decompression can be turned off by |
|
64 - specifying one process (-dp 1 or -tp 1). |
|
65 + |
|
66 + The --index or -X option causes the generation of a block index which can be |
|
67 + used for parallel decompression. The block index can be appended onto the |
|
68 + compressed output or it may be stored in a separate file. The uncompressed |
|
69 + size, compressed size, checksum of each block are stored in the index, |
|
70 + allowing future applications to perform random reads of the compressed file. |
|
71 + Streams generated with -X are readable by legacy versions of pigz and gzip. |
|
72 + |
|
73 + Decompression can be parallelized, but only if a block index is available. |
|
74 + If a block index is not present, pigz uses a single thread (the main thread) |
|
75 + for decompression, but will create three other threads for reading, writing, |
|
76 + and check calculation, which can speed up decompression under some |
|
77 + circumstances. Parallel decompression can be turned off by specifying one |
|
78 + process (-dp 1 or -tp 1). |
|
79 + |
|
80 + If the block index is present, the main thread reads the input file and |
|
81 + dispatches each block to an uncompress thread. The uncompress thread |
|
82 + uncompresses the block, verifies the block checksum, and passes the block |
|
83 + off to a writer thread. The writer thread writes the blocks in order, |
|
84 + and combines the individual block checksums into a per-file checksum. The |
|
85 + per-file checksum is compared to the checksum in the stream's trailer. |
|
86 |
|
87 pigz requires zlib 1.2.1 or later to allow setting the dictionary when doing |
|
88 raw deflate. Since zlib 1.2.3 corrects security vulnerabilities in zlib |
|
89 @@ -260,13 +274,14 @@ |
|
90 can't get way ahead of the write thread and build up a large backlog of |
|
91 unwritten compressed data. The write thread will write the compressed data, |
|
92 drop the output buffer, and then wait for the check value to be unlocked |
|
93 - by the compress thread. Then the write thread combines the check value for |
|
94 - this chunk with the total check value for eventual use in the trailer. If |
|
95 - this is not the last chunk, the write thread then goes back to look for the |
|
96 - next output chunk in sequence. After the last chunk, the write thread |
|
97 - returns and joins the main thread. Unlike the compress threads, a new write |
|
98 - thread is launched for each input stream. The write thread writes the |
|
99 - appropriate header and trailer around the compressed data. |
|
100 + by the compress thread. Then the write thread writes an index entry (if -X) |
|
101 + and combines the check value for this chunk with the total check value for |
|
102 + eventual use in the trailer. If this is not the last chunk, the write thread |
|
103 + then goes back to look for the next output chunk in sequence. After the last |
|
104 + chunk, the write thread returns and joins the main thread. Unlike the |
|
105 + compress threads, a new write thread is launched for each input stream. The |
|
106 + write thread writes the appropriate header and trailer around the compressed |
|
107 + data. |
|
108 |
|
109 The input and output buffers are reused through their collection in pools. |
|
110 Each buffer has a use count, which when decremented to zero returns the |
|
111 @@ -314,6 +329,9 @@ |
|
112 #if __STDC_VERSION__-0 >= 199901L || __GNUC__-0 >= 3 |
|
113 # include <inttypes.h> /* intmax_t */ |
|
114 #endif |
|
115 +#include <stddef.h> /* offsetof() */ |
|
116 +#include <sys/mman.h> /* mmap() */ |
|
117 +#include <netinet/in.h> /* htonl() */ |
|
118 |
|
119 #ifdef __hpux |
|
120 # include <sys/param.h> |
|
121 @@ -421,8 +439,10 @@ |
|
122 local char *prog; /* name by which pigz was invoked */ |
|
123 local int ind; /* input file descriptor */ |
|
124 local int outd; /* output file descriptor */ |
|
125 +local int idxd; /* index file descriptor */ |
|
126 local char in[PATH_MAX+1]; /* input file name (accommodate recursion) */ |
|
127 local char *out = NULL; /* output file name (allocated if not NULL) */ |
|
128 +local char *index = NULL; /* index file name template (may have %f, %z) */ |
|
129 local int verbosity; /* 0 = quiet, 1 = normal, 2 = verbose, 3 = trace */ |
|
130 local int headis; /* 1 to store name, 2 to store date, 3 both */ |
|
131 local int pipeout; /* write output to stdout even if file */ |
|
132 @@ -468,9 +488,12 @@ local int complain(char *fmt, ...) |
|
133 return 0; |
|
134 } |
|
135 |
|
136 +local void idx_abort(void); |
|
137 + |
|
138 /* exit with error, delete output file if in the middle of writing it */ |
|
139 local int bail(char *why, char *what) |
|
140 { |
|
141 + idx_abort(); |
|
142 if (outd != -1 && out != NULL) |
|
143 unlink(out); |
|
144 complain("abort: %s%s", why, what); |
|
145 @@ -685,11 +708,23 @@ local unsigned long time2dos(time_t t) |
|
146 return dos; |
|
147 } |
|
148 |
|
149 -/* put a 4-byte integer into a byte array in LSB order or MSB order */ |
|
150 +/* put integers into a byte array in LSB order or MSB order */ |
|
151 #define PUT2L(a,b) (*(a)=(b)&0xff,(a)[1]=(b)>>8) |
|
152 #define PUT4L(a,b) (PUT2L(a,(b)&0xffff),PUT2L((a)+2,(b)>>16)) |
|
153 +#define PUT8L(a,b) (PUT4L(a,(b)&0xffffffff),PUT4L((a)+4,(b)>>32)) |
|
154 #define PUT4M(a,b) (*(a)=(b)>>24,(a)[1]=(b)>>16,(a)[2]=(b)>>8,(a)[3]=(b)) |
|
155 |
|
156 +/* pull LSB order or MSB order integers from an unsigned char buffer */ |
|
157 +#define PULL2L(p) ((p)[0] + ((unsigned)((p)[1]) << 8)) |
|
158 +#define PULL4L(p) (PULL2L(p) + ((unsigned long)(PULL2L((p) + 2)) << 16)) |
|
159 +#define PULL8L(p) ((uint64_t)((p)[0]) | ((uint64_t)((p)[1]) << 8) | \ |
|
160 + ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) | \ |
|
161 + ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) | \ |
|
162 + ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56)) |
|
163 +#define PULL2M(p) (((unsigned)((p)[0]) << 8) + (p)[1]) |
|
164 +#define PULL4M(p) (((unsigned long)(PULL2M(p)) << 16) + PULL2M((p) + 2)) |
|
165 + |
|
166 + |
|
167 /* write a gzip, zlib, or zip header using the information in the globals */ |
|
168 local unsigned long put_header(void) |
|
169 { |
|
170 @@ -983,7 +1018,7 @@ local void new_pool(struct pool *pool, size_t size, int limit) |
|
171 |
|
172 /* get a space from a pool -- the use count is initially set to one, so there |
|
173 is no need to call use_space() for the first use */ |
|
174 -local struct space *get_space(struct pool *pool) |
|
175 +local struct space *get_space_size(struct pool *pool, size_t size) |
|
176 { |
|
177 struct space *space; |
|
178 |
|
179 @@ -996,6 +1031,15 @@ local struct space *get_space(struct pool *pool) |
|
180 if (pool->head != NULL) { |
|
181 space = pool->head; |
|
182 possess(space->use); |
|
183 + /* If there's not enough space, free and malloc rather than realloc to |
|
184 + avoid the potential of an unnecessary memory copy. */ |
|
185 + if (space->size < size) { |
|
186 + free(space->buf); |
|
187 + space->buf = malloc(size); |
|
188 + if (space->buf == NULL) |
|
189 + bail("not enough memory", ""); |
|
190 + space->size = size; |
|
191 + } |
|
192 pool->head = space->next; |
|
193 twist(pool->have, BY, -1); /* one less in pool */ |
|
194 twist(space->use, TO, 1); /* initially one user */ |
|
195 @@ -1013,15 +1057,20 @@ local struct space *get_space(struct pool *pool) |
|
196 if (space == NULL) |
|
197 bail("not enough memory", ""); |
|
198 space->use = new_lock(1); /* initially one user */ |
|
199 - space->buf = malloc(pool->size); |
|
200 + space->buf = malloc(size); |
|
201 if (space->buf == NULL) |
|
202 bail("not enough memory", ""); |
|
203 - space->size = pool->size; |
|
204 + space->size = size; |
|
205 space->len = 0; |
|
206 space->pool = pool; /* remember the pool this belongs to */ |
|
207 return space; |
|
208 } |
|
209 |
|
210 +local struct space *get_space(struct pool *pool) |
|
211 +{ |
|
212 + return get_space_size(pool, pool->size); |
|
213 +} |
|
214 + |
|
215 /* compute next size up by multiplying by about 2**(1/3) and round to the next |
|
216 power of 2 if we're close (so three applications results in doubling) -- if |
|
217 small, go up to at least 16, if overflow, go to max size_t value */ |
|
218 @@ -1110,17 +1159,35 @@ local int free_pool(struct pool *pool) |
|
219 return count; |
|
220 } |
|
221 |
|
222 +/* prompt for permission to overwrite a file */ |
|
223 +local int allow_overwrite(const char *path) |
|
224 +{ |
|
225 + char ch; |
|
226 + int reply = -1; |
|
227 + |
|
228 + fprintf(stderr, "%s exists -- overwrite (y/n)? ", path); |
|
229 + fflush(stderr); |
|
230 + do { |
|
231 + ch = getchar(); |
|
232 + if (reply < 0 && ch != ' ' && ch != '\t') |
|
233 + reply = ch == 'y' || ch == 'Y' ? 1 : 0; |
|
234 + } while (ch != EOF && ch != '\n' && ch != '\r'); |
|
235 + return reply; |
|
236 +} |
|
237 + |
|
238 /* input and output buffer pools */ |
|
239 local struct pool in_pool; |
|
240 local struct pool out_pool; |
|
241 local struct pool dict_pool; |
|
242 local struct pool lens_pool; |
|
243 +local struct pool idx_pool; |
|
244 |
|
245 /* -- parallel compression -- */ |
|
246 |
|
247 /* compress or write job (passed from compress list to write list) -- if seq is |
|
248 equal to -1, compress_thread is instructed to return; if more is false then |
|
249 - this is the last chunk, which after writing tells write_thread to return */ |
|
250 + this is the last chunk, which after writing tells compress_write_thread to |
|
251 + return */ |
|
252 struct job { |
|
253 long seq; /* sequence number */ |
|
254 int more; /* true if this is not the last chunk */ |
|
255 @@ -1167,6 +1234,7 @@ local void setup_jobs(void) |
|
256 new_pool(&out_pool, OUTPOOL(size), -1); |
|
257 new_pool(&dict_pool, DICT, -1); |
|
258 new_pool(&lens_pool, size >> (RSYNCBITS - 1), -1); |
|
259 + new_pool(&idx_pool, 1, -1); |
|
260 } |
|
261 |
|
262 /* command the compress threads to all return, then join them all (call from |
|
263 @@ -1203,6 +1271,8 @@ local void finish_jobs(void) |
|
264 Trace(("-- freed %d output buffers", caught)); |
|
265 caught = free_pool(&in_pool); |
|
266 Trace(("-- freed %d input buffers", caught)); |
|
267 + caught = free_pool(&idx_pool); |
|
268 + Trace(("-- freed %d index buffers", caught)); |
|
269 free_lock(write_first); |
|
270 free_lock(compress_have); |
|
271 compress_have = NULL; |
|
272 @@ -1396,18 +1466,483 @@ local void compress_thread(void *dummy) |
|
273 (void)deflateEnd(&strm); |
|
274 } |
|
275 |
|
276 +/* Block Index |
|
277 + |
|
278 + The block index is an array of idx_entry structs followed by an idx_trailer |
|
279 + struct. They are written to the file in LSB order. The block index can |
|
280 + exist as a standalone file or be appended onto the compressed files. |
|
281 + |
|
282 + The trailer is used to identify a block index. The beginning of the trailer |
|
283 + contains a magic number that is a value too large to be confused with a valid |
|
284 + block length. Aside from backwards P's the magic number looks kinda like |
|
285 + "0xf pigzip 0xf". */ |
|
286 +#define IDXMAGIC 0xf916219f |
|
287 + |
|
288 +struct idx_trailer { |
|
289 + uint32_t magic; |
|
290 + uint64_t count; |
|
291 +}; |
|
292 + |
|
293 +struct idx_entry { |
|
294 + uint32_t infsz; /* inflated size of the block */ |
|
295 + uint32_t defsz; /* deflated size of the block */ |
|
296 + uint32_t check; /* adler32 or crc32 checksum of the block */ |
|
297 +}; |
|
298 + |
|
299 +local struct { |
|
300 + int valid; /* Do the rest of these fields mean anything? */ |
|
301 + |
|
302 + /* An array of entries. References address in space or map */ |
|
303 + struct idx_entry *ents; /* not in right byte order, used for offset */ |
|
304 + uint64_t seq; /* current entry */ |
|
305 + int64_t eof; /* has the last entry been retrieved? */ |
|
306 + |
|
307 + /* When compressing and appending, entries are stored in space->buf. */ |
|
308 + int append; /* is the index at end of compressed file? */ |
|
309 + struct space *space; /* space for storage of index */ |
|
310 + |
|
311 + /* The following are valid only when mmap is used. */ |
|
312 + uchar_t *map; /* mmap'd region containing ents */ |
|
313 + size_t mapsz; /* size of mmap'd region at map */ |
|
314 + off_t mapoff; /* bytes between map and ents */ |
|
315 + |
|
316 + /* Index path, after %f and %z are replaced. */ |
|
317 + char path[PATH_MAX+1]; |
|
318 +} idx; |
|
319 + |
|
320 +/* determines if the two paths refer to the same extant file */ |
|
321 +local int same_file(const char *f1, const char *f2) |
|
322 +{ |
|
323 + struct stat s1; |
|
324 + struct stat s2; |
|
325 + |
|
326 + return (stat(f1, &s1) == 0 && stat(f2, &s2) == 0 && |
|
327 + s1.st_dev == s2.st_dev && s1.st_ino == s2.st_ino); |
|
328 +} |
|
329 + |
|
330 +/* Remove the index file, but only if it is not the same as in or out. |
|
331 + We don't worry about a full cleanup, as this should only be called in an |
|
332 + error path just before exiting. */ |
|
333 +local void idx_abort(void) |
|
334 +{ |
|
335 + if (!idx.valid) |
|
336 + return; |
|
337 + if (idx.path[0] == '\0' || idx.append) |
|
338 + return; |
|
339 + (void) unlink(idx.path); |
|
340 +} |
|
341 + |
|
342 +/* If 0 is returned, a trailer was found and read. Non-zero return means |
|
343 + there was no trailer. Does not exit. Does not change file pointer for fd. */ |
|
344 +local int idx_read_trailer(int fd, char *path, struct idx_trailer *trail) |
|
345 +{ |
|
346 + uchar_t buf[sizeof(*trail)]; |
|
347 + off_t off; |
|
348 + struct stat st; |
|
349 + |
|
350 + if (fd < 0) { |
|
351 + Trace(("%s: index file descriptor %d not valid", path, fd)); |
|
352 + return -1; |
|
353 + } |
|
354 + if (fstat(fd, &st) != 0 || !S_ISREG(st.st_mode)) { |
|
355 + Trace(("%s: index appended to non-regular file", path)); |
|
356 + return -1; |
|
357 + } |
|
358 + off = st.st_size - sizeof(*trail); |
|
359 + if (off < 0) { |
|
360 + Trace(("%s: index file too short for header", path)); |
|
361 + return -1; |
|
362 + } |
|
363 + if (pread(fd, buf, sizeof(buf), off) != sizeof(buf)) { |
|
364 + Trace(("%s: unable to read index trailer", path)); |
|
365 + return -1; |
|
366 + } |
|
367 + trail->magic = PULL4L(buf); |
|
368 + trail->count = PULL8L(buf + 4); |
|
369 + |
|
370 + if (trail->magic != IDXMAGIC) { |
|
371 + Trace(("%s: invalid pigz index magic", path)); |
|
372 + return -1; |
|
373 + } |
|
374 + return 0; |
|
375 +} |
|
376 + |
|
377 +/* Expand a path pattern containing %f and/or %z tokens into a full path. |
|
378 + * Result is stored in idx.path. */ |
|
379 +local int expand_pathpat(char *pathpat) |
|
380 +{ |
|
381 + char *copy = NULL; /* points to in or out global */ |
|
382 + char *suf = NULL; /* suffix (.zz, .gz, etc.) */ |
|
383 + int chop_suffix; |
|
384 + int len; |
|
385 + int i; |
|
386 + int j; |
|
387 + int nag; |
|
388 + |
|
389 + /* Be quiet when opportunistic index use check is being done. */ |
|
390 + nag = ((index == NULL) && strcmp(pathpat, "%z")); |
|
391 + |
|
392 + for (i = 0, j = 0; pathpat[i] && j < sizeof(idx.path); i++) { |
|
393 + if (pathpat[i] != '%') { |
|
394 + idx.path[j++] = pathpat[i]; |
|
395 + continue; |
|
396 + } |
|
397 + i++; |
|
398 + switch (pathpat[i]) { |
|
399 + case '%': /* %% is replaced by % */ |
|
400 + idx.path[j++] = '%'; |
|
401 + continue; |
|
402 + case 'f': /* %f is replaced by uncompressed file name */ |
|
403 + if (decode) { |
|
404 + if (strcmp(out, "<stdout>") != 0) { |
|
405 + copy = out; /* uncompressed file */ |
|
406 + chop_suffix = 0; |
|
407 + break; |
|
408 + } |
|
409 + if (strcmp(in, "<stdin>") != 0) { |
|
410 + copy = in; /* compressed file */ |
|
411 + chop_suffix = 1; |
|
412 + suf = strrchr(in, '.'); |
|
413 + break; |
|
414 + } |
|
415 + if (nag) |
|
416 + complain("file name for %%f unknown"); |
|
417 + return -1; |
|
418 + } |
|
419 + |
|
420 + if (strcmp(out, "<stdout>") != 0) { |
|
421 + copy = out; /* compressed file */ |
|
422 + chop_suffix = 1; |
|
423 + suf = strrchr(out, '.'); |
|
424 + break; |
|
425 + } |
|
426 + if (strcmp(in, "<stdin>") != 0) { |
|
427 + copy = in; /* uncompressed file */ |
|
428 + chop_suffix = 0; |
|
429 + break; |
|
430 + } |
|
431 + if (nag) |
|
432 + complain("file name for %%f unknown"); |
|
433 + return -1; |
|
434 + case 'z': /* %z is replaced by compressed file name */ |
|
435 + chop_suffix = 0; |
|
436 + if (decode) { |
|
437 + if (strcmp(in, "<stdin>") == 0) { |
|
438 + if (nag) |
|
439 + complain("file name for %%z unknown"); |
|
440 + return -1; |
|
441 + } |
|
442 + copy = in; |
|
443 + break; |
|
444 + } |
|
445 + if (strcmp(pathpat, "%z") == 0) { |
|
446 + /* index will be appended onto stdout */ |
|
447 + copy = NULL; |
|
448 + idx.append = 1; |
|
449 + break; |
|
450 + } |
|
451 + if (strcmp(out, "<stdout>") == 0) { |
|
452 + if (nag) |
|
453 + complain("file name for %%z unknown"); |
|
454 + return -1; |
|
455 + } |
|
456 + copy = out; |
|
457 + break; |
|
458 + default: |
|
459 + if (nag) { |
|
460 + complain("invalid %% sequence in index file pattern %s", |
|
461 + pathpat); |
|
462 + } |
|
463 + return -1; |
|
464 + } |
|
465 + |
|
466 + /* pathpat is "%z" and out is stdout */ |
|
467 + if (copy == NULL) |
|
468 + break; |
|
469 + |
|
470 + len = strlen(&idx.path[j]) + strlen(copy); |
|
471 + if (chop_suffix) |
|
472 + len -= strlen(suf); |
|
473 + if (len >= (sizeof(idx.path) - j)) { |
|
474 + if (nag) |
|
475 + complain("index file name too long"); |
|
476 + return -1; |
|
477 + } |
|
478 + (void)strncpy(&idx.path[j], copy, sizeof(idx.path) - j); |
|
479 + j += len; |
|
480 + assert(j <= sizeof(idx.path)); |
|
481 + } |
|
482 + if (j == sizeof(idx.path)) { |
|
483 + idx.path[j-1] = '\0'; |
|
484 + if (nag) |
|
485 + complain("index file \"%s...\" name too long", idx.path); |
|
486 + return -1; |
|
487 + } |
|
488 + idx.path[j] = '\0'; |
|
489 + |
|
490 + if (copy == NULL && idx.append) { |
|
491 + (void)strncpy(idx.path, out, sizeof(idx.path)); |
|
492 + idx.path[sizeof(idx.path) - 1] = '\0'; |
|
493 + } |
|
494 + else { |
|
495 + if (same_file(decode ? out : in, idx.path)) { |
|
496 + if (nag) |
|
497 + complain("index file %s must not be same as uncompressed file", |
|
498 + idx.path); |
|
499 + return -1; |
|
500 + } |
|
501 + |
|
502 + idx.append = same_file(decode ? in : out, idx.path); |
|
503 + } |
|
504 + |
|
505 + if (verbosity > 1) |
|
506 + (void) fprintf(stderr, "index %s ", idx.path); |
|
507 + |
|
508 + return 0; |
|
509 +} |
|
510 + |
|
511 +/* open the index file associated with the current input or output file. */ |
|
512 +local int idx_open(char *pathpat) |
|
513 +{ |
|
514 + int ret; |
|
515 + struct stat st; |
|
516 + |
|
517 + assert(pathpat != NULL); |
|
518 + |
|
519 + memset(&idx, 0, sizeof(idx)); |
|
520 + |
|
521 + setup_jobs(); |
|
522 + |
|
523 + idxd = -1; |
|
524 + |
|
525 + if (expand_pathpat(pathpat) != 0) |
|
526 + return -1; |
|
527 + |
|
528 + if (decode) { /* Uncompress */ |
|
529 + int64_t sz; |
|
530 + int64_t off; |
|
531 + long pagesize; |
|
532 + |
|
533 + /* Position idxd at the first index record to read. */ |
|
534 + if (idx.append) { |
|
535 + struct idx_trailer trail; |
|
536 + |
|
537 + /* uncompressing, index at end of compressed file */ |
|
538 + if (idx_read_trailer(ind, in, &trail) != 0) { |
|
539 + complain("%s: could not read index", in); |
|
540 + return -1; |
|
541 + } |
|
542 + |
|
543 + idxd = dup(ind); |
|
544 + if (fstat(idxd, &st) != 0 || !S_ISREG(st.st_mode)) { |
|
545 + complain("%s: index appended to non-regular file", idx.path); |
|
546 + (void) close(idxd); |
|
547 + return -1; |
|
548 + } |
|
549 + off = st.st_size - sizeof(trail); |
|
550 + sz = trail.count * sizeof(struct idx_entry); |
|
551 + off -= sz; /* offset into file of first idx_entry */ |
|
552 + } else { |
|
553 + /* Uncompressing, index in a different file. */ |
|
554 + if ((idxd = open(idx.path, O_RDONLY)) < 0) { |
|
555 + complain("%s: unable to open index file", idx.path); |
|
556 + return -1; |
|
557 + } |
|
558 + if (fstat(idxd, &st) != 0) { |
|
559 + complain("%s: unable to stat index file", idx.path); |
|
560 + (void) close(idxd); |
|
561 + return -1; |
|
562 + } |
|
563 + off = 0; |
|
564 + } |
|
565 + /* Try to mmap the index file and let the OS manage the space used by |
|
566 + the index entries. The starting offset of must be a multiple of the |
|
567 + page size. The mapping will end at the end of the file. */ |
|
568 + if ((pagesize = sysconf(_SC_PAGESIZE)) > 0) { |
|
569 + off_t moff; /* mmap offset in idxd */ |
|
570 + |
|
571 + /* moff is the beginning of the page containing off */ |
|
572 + moff = off & ~(pagesize -1); |
|
573 + idx.mapsz = st.st_size - moff; |
|
574 + idx.map = mmap(NULL, idx.mapsz, PROT_READ, MAP_PRIVATE, idxd, moff); |
|
575 + if (idx.map != MAP_FAILED) { |
|
576 + (void)close(idxd); |
|
577 + idxd = -1; |
|
578 + |
|
579 + /* set up array for idx_get() */ |
|
580 + idx.ents = (struct idx_entry*)(idx.map + (off & (pagesize -1))); |
|
581 + |
|
582 + idx.valid = 1; |
|
583 + return 0; |
|
584 + } |
|
585 + idx.mapsz = 0; |
|
586 + idx.map = NULL; |
|
587 + } |
|
588 + /* unable to mmap. Ensure idxfd is positioned properly. */ |
|
589 + if (lseek(idxd, off, SEEK_SET) != off) { |
|
590 + complain("%s: unable to seek on index file", idx.path); |
|
591 + return -1; |
|
592 + } |
|
593 + idx.valid = 1; |
|
594 + return 0; |
|
595 + } |
|
596 + |
|
597 + /* compress - entries will be added to idx.space or idxd. */ |
|
598 + if (idx.append) { |
|
599 + idx.space = get_space(&idx_pool); |
|
600 + idx.valid = 1; |
|
601 + return 0; |
|
602 + } |
|
603 + |
|
604 + idxd = open(idx.path, O_WRONLY | O_CREAT | O_TRUNC | (force ? 0 : O_EXCL), |
|
605 + 0600); |
|
606 + if (idxd < 0 && errno == EEXIST && isatty(0) && verbosity && |
|
607 + allow_overwrite(idx.path)) { |
|
608 + idxd = open(idx.path, O_CREAT | O_TRUNC | O_WRONLY, 0600); |
|
609 + if (idxd == -1) { |
|
610 + complain("%s: %s", idx.path, strerror(errno)); |
|
611 + return -1; |
|
612 + } |
|
613 + } |
|
614 + idx.valid = 1; |
|
615 + return 0; |
|
616 +} |
|
617 + |
|
618 +local void idx_get_next(struct idx_entry *entry) |
|
619 +{ |
|
620 + uchar_t buf[sizeof(*entry)]; |
|
621 + uchar_t *base; |
|
622 + |
|
623 + if (idx.ents != NULL) |
|
624 + base = (uchar_t *)&idx.ents[idx.seq]; |
|
625 + else { |
|
626 + readn(idxd, buf, sizeof(buf)); |
|
627 + base = buf; |
|
628 + } |
|
629 + entry->infsz = PULL4L(base); |
|
630 + entry->defsz = PULL4L(base + 4); |
|
631 + entry->check = PULL4L(base + 8); |
|
632 +} |
|
633 + |
|
634 +/* Returns the fields of the next index entry. */ |
|
635 +local void idx_get(uint64_t *inflated, uint64_t *deflated, uint64_t *check, |
|
636 + int *last) |
|
637 +{ |
|
638 + struct idx_trailer *t; |
|
639 + static struct idx_entry entry; /* value from previous call */ |
|
640 + |
|
641 + assert(!idx.eof); |
|
642 + |
|
643 + if (idx.seq == 0) |
|
644 + idx_get_next(&entry); |
|
645 + |
|
646 + *inflated = entry.infsz; |
|
647 + *deflated = entry.defsz; |
|
648 + *check = entry.check; |
|
649 + idx.seq++; |
|
650 + |
|
651 + /* Look for trailer after this. Value retained for next call. */ |
|
652 + idx_get_next(&entry); |
|
653 + |
|
654 + t = (struct idx_trailer *)&entry; |
|
655 + *last = (t->magic == IDXMAGIC); |
|
656 + idx.eof = *last; |
|
657 +} |
|
658 + |
|
659 +local void idx_add(size_t insz, size_t outsz, unsigned long check) |
|
660 +{ |
|
661 + uchar_t buf[sizeof(struct idx_entry)]; |
|
662 + uchar_t *start; |
|
663 + |
|
664 + idx.seq++; |
|
665 + |
|
666 + /* point start at the right buffer, ensuring it is big enough */ |
|
667 + if (idxd != -1) { |
|
668 + start = buf; |
|
669 + } else { |
|
670 + possess(idx.space->use); |
|
671 + while (idx.space->size - idx.space->len < sizeof(struct idx_entry)) |
|
672 + grow_space(idx.space); |
|
673 + start = idx.space->buf + idx.space->len; |
|
674 + } |
|
675 + |
|
676 + /* copy data into buffer */ |
|
677 + PUT4L(start, (uint32_t)insz); |
|
678 + PUT4L(start + 4, (uint32_t)outsz); |
|
679 + PUT4L(start + 8, (uint32_t)check); |
|
680 + |
|
681 + if (idxd != -1) |
|
682 + writen(idxd, buf, sizeof(buf)); |
|
683 + else { |
|
684 + idx.space->len += sizeof(struct idx_entry); |
|
685 + release(idx.space->use); |
|
686 + } |
|
687 +} |
|
688 + |
|
689 +local void idx_close(void) |
|
690 +{ |
|
691 + uchar_t buf[sizeof(struct idx_trailer)]; |
|
692 + |
|
693 + assert(idx.valid); |
|
694 + idx.valid = 0; |
|
695 + |
|
696 + if (decode && !keep && !idx.append) |
|
697 + (void)unlink(idx.path); |
|
698 + |
|
699 + if (idx.map != NULL) { /* uncompressing, using mmap'd index */ |
|
700 + (void)munmap(idx.map, idx.mapsz); |
|
701 + idx.ents = NULL; |
|
702 + return; |
|
703 + } |
|
704 + |
|
705 + if (decode) { /* uncompressing, from a file */ |
|
706 + (void)close(idxd); |
|
707 + idxd = -1; |
|
708 + return; |
|
709 + } |
|
710 + |
|
711 + if (idx.space != NULL) { /* compressing, append to output file */ |
|
712 + writen(outd, idx.space->buf, idx.space->len); |
|
713 + release(idx.space->use); |
|
714 + drop_space(idx.space); |
|
715 + } |
|
716 + |
|
717 + PUT4L(buf, IDXMAGIC); |
|
718 + PUT8L(buf + 4, idx.seq); |
|
719 + |
|
720 + writen(idx.append ? outd : idxd, buf, sizeof(buf)); |
|
721 + |
|
722 + if (idxd != -1) { |
|
723 + (void) close(idxd); |
|
724 + idxd = -1; |
|
725 + } |
|
726 +} |
|
727 + |
|
728 +/* Does the compressed input file have an index appended? */ |
|
729 +local int ind_has_index(void) |
|
730 +{ |
|
731 + struct idx_trailer trail; |
|
732 + |
|
733 + /* Not relevant unless we are uncompressing */ |
|
734 + if (decode == 0) |
|
735 + return (0); |
|
736 + |
|
737 + return (idx_read_trailer(ind, in, &trail) == 0); |
|
738 +} |
|
739 + |
|
740 /* collect the write jobs off of the list in sequence order and write out the |
|
741 compressed data until the last chunk is written -- also write the header and |
|
742 trailer and combine the individual check values of the input buffers */ |
|
743 -local void write_thread(void *dummy) |
|
744 +local void compress_write_thread(void *dummy) |
|
745 { |
|
746 long seq; /* next sequence number looking for */ |
|
747 struct job *job; /* job pulled and working on */ |
|
748 size_t len; /* input length */ |
|
749 + size_t olen; /* output length */ |
|
750 int more; /* true if more chunks to write */ |
|
751 unsigned long head; /* header length */ |
|
752 unsigned long ulen; /* total uncompressed size (overflow ok) */ |
|
753 - unsigned long clen; /* total compressed size (overflow ok) */ |
|
754 + size_t clen; /* total compressed size */ |
|
755 unsigned long check; /* check value of uncompressed data */ |
|
756 |
|
757 (void)dummy; |
|
758 @@ -1431,23 +1966,27 @@ local void write_thread(void *dummy) |
|
759 /* update lengths, save uncompressed length for COMB */ |
|
760 more = job->more; |
|
761 len = job->in->len; |
|
762 + olen = job->out->len; |
|
763 drop_space(job->in); |
|
764 ulen += (unsigned long)len; |
|
765 - clen += (unsigned long)(job->out->len); |
|
766 + clen += olen; |
|
767 |
|
768 /* write the compressed data and drop the output buffer */ |
|
769 Trace(("-- writing #%ld", seq)); |
|
770 - writen(outd, job->out->buf, job->out->len); |
|
771 + writen(outd, job->out->buf, olen); |
|
772 drop_space(job->out); |
|
773 Trace(("-- wrote #%ld%s", seq, more ? "" : " (last)")); |
|
774 |
|
775 - /* wait for check calculation to complete, then combine, once |
|
776 - the compress thread is done with the input, release it */ |
|
777 + /* wait for check calculation to complete, then combine */ |
|
778 possess(job->calc); |
|
779 wait_for(job->calc, TO_BE, 1); |
|
780 release(job->calc); |
|
781 check = COMB(check, job->check, len); |
|
782 |
|
783 + /* update the block index */ |
|
784 + if (index) |
|
785 + idx_add(len, olen, job->check); |
|
786 + |
|
787 /* free the job */ |
|
788 free_lock(job->calc); |
|
789 free(job); |
|
790 @@ -1518,7 +2057,7 @@ local void parallel_compress(void) |
|
791 setup_jobs(); |
|
792 |
|
793 /* start write thread */ |
|
794 - writeth = launch(write_thread, NULL); |
|
795 + writeth = launch(compress_write_thread, NULL); |
|
796 |
|
797 /* read from input and start compress threads (write thread will pick up |
|
798 the output of the compress threads) */ |
|
799 @@ -1914,7 +2453,7 @@ local size_t load(void) |
|
800 #ifndef NOTHREAD |
|
801 /* if first time in or procs == 1, read a buffer to have something to |
|
802 return, otherwise wait for the previous read job to complete */ |
|
803 - if (procs > 1) { |
|
804 + if (procs > 1 && index == NULL && !ind_has_index()) { |
|
805 /* if first time, fire up the read thread, ask for a read */ |
|
806 if (in_which == -1) { |
|
807 in_which = 1; |
|
808 @@ -1996,12 +2535,6 @@ local void in_init(void) |
|
809 in_next += togo; \ |
|
810 } while (0) |
|
811 |
|
812 -/* pull LSB order or MSB order integers from an unsigned char buffer */ |
|
813 -#define PULL2L(p) ((p)[0] + ((unsigned)((p)[1]) << 8)) |
|
814 -#define PULL4L(p) (PULL2L(p) + ((unsigned long)(PULL2L((p) + 2)) << 16)) |
|
815 -#define PULL2M(p) (((unsigned)((p)[0]) << 8) + (p)[1]) |
|
816 -#define PULL4M(p) (((unsigned long)(PULL2M(p)) << 16) + PULL2M((p) + 2)) |
|
817 - |
|
818 /* convert MS-DOS date and time to a Unix time, assuming current timezone |
|
819 (you got a better idea?) */ |
|
820 local time_t dos2time(unsigned long dos) |
|
821 @@ -2614,6 +3147,73 @@ local int outb(void *desc, unsigned char *buf, unsigned len) |
|
822 return 0; |
|
823 } |
|
824 |
|
825 +local void check_trailer(unsigned long check, off_t clen) |
|
826 +{ |
|
827 + unsigned tmp2; /* used by GET4() */ |
|
828 + unsigned long tmp4; /* used by GET4() */ |
|
829 + unsigned long len; |
|
830 + |
|
831 + /* read and check trailer */ |
|
832 + if (form > 1) { /* zip local trailer (if any) */ |
|
833 + if (form == 3) { /* data descriptor follows */ |
|
834 + /* read original version of data descriptor */ |
|
835 + zip_crc = GET4(); |
|
836 + zip_clen = GET4(); |
|
837 + zip_ulen = GET4(); |
|
838 + if (in_eof) |
|
839 + bail("corrupted zip entry -- missing trailer: ", in); |
|
840 + |
|
841 + /* if crc doesn't match, try info-zip variant with sig */ |
|
842 + if (zip_crc != out_check) { |
|
843 + if (zip_crc != 0x08074b50UL || zip_clen != out_check) |
|
844 + bail("corrupted zip entry -- crc32 mismatch: ", in); |
|
845 + zip_crc = zip_clen; |
|
846 + zip_clen = zip_ulen; |
|
847 + zip_ulen = GET4(); |
|
848 + } |
|
849 + |
|
850 + /* handle incredibly rare cases where crc equals signature */ |
|
851 + else if (zip_crc == 0x08074b50UL && zip_clen == zip_crc && |
|
852 + ((clen & LOW32) != zip_crc || zip_ulen == zip_crc)) { |
|
853 + zip_crc = zip_clen; |
|
854 + zip_clen = zip_ulen; |
|
855 + zip_ulen = GET4(); |
|
856 + } |
|
857 + |
|
858 + /* if second length doesn't match, try 64-bit lengths */ |
|
859 + if (zip_ulen != (out_tot & LOW32)) { |
|
860 + zip_ulen = GET4(); |
|
861 + (void)GET4(); |
|
862 + } |
|
863 + if (in_eof) |
|
864 + bail("corrupted zip entry -- missing trailer: ", in); |
|
865 + } |
|
866 + if (zip_clen != (clen & LOW32) || zip_ulen != (out_tot & LOW32)) |
|
867 + bail("corrupted zip entry -- length mismatch: ", in); |
|
868 + check = zip_crc; |
|
869 + } |
|
870 + else if (form == 1) { /* zlib (big-endian) trailer */ |
|
871 + check = (unsigned long)(GET()) << 24; |
|
872 + check += (unsigned long)(GET()) << 16; |
|
873 + check += (unsigned)(GET()) << 8; |
|
874 + check += GET(); |
|
875 + if (in_eof) |
|
876 + bail("corrupted zlib stream -- missing trailer: ", in); |
|
877 + if (check != out_check) |
|
878 + bail("corrupted zlib stream -- adler32 mismatch: ", in); |
|
879 + } |
|
880 + else { /* gzip trailer */ |
|
881 + check = GET4(); |
|
882 + len = GET4(); |
|
883 + if (in_eof) |
|
884 + bail("corrupted gzip stream -- missing trailer: ", in); |
|
885 + if (check != out_check) |
|
886 + bail("corrupted gzip stream -- crc32 mismatch: ", in); |
|
887 + if (len != (out_tot & LOW32)) |
|
888 + bail("corrupted gzip stream -- length mismatch: ", in); |
|
889 + } |
|
890 +} |
|
891 + |
|
892 /* inflate for decompression or testing -- decompress from ind to outd unless |
|
893 decode != 1, in which case just test ind, and then also list if list != 0; |
|
894 look for and decode multiple, concatenated gzip and/or zlib streams; |
|
895 @@ -2621,10 +3221,8 @@ local int outb(void *desc, unsigned char *buf, unsigned len) |
|
896 local void infchk(void) |
|
897 { |
|
898 int ret, cont; |
|
899 - unsigned long check, len; |
|
900 + unsigned long check; |
|
901 z_stream strm; |
|
902 - unsigned tmp2; |
|
903 - unsigned long tmp4; |
|
904 off_t clen; |
|
905 |
|
906 cont = 0; |
|
907 @@ -2654,65 +3252,7 @@ local void infchk(void) |
|
908 /* compute compressed data length */ |
|
909 clen = in_tot - in_left; |
|
910 |
|
911 - /* read and check trailer */ |
|
912 - if (form > 1) { /* zip local trailer (if any) */ |
|
913 - if (form == 3) { /* data descriptor follows */ |
|
914 - /* read original version of data descriptor */ |
|
915 - zip_crc = GET4(); |
|
916 - zip_clen = GET4(); |
|
917 - zip_ulen = GET4(); |
|
918 - if (in_eof) |
|
919 - bail("corrupted zip entry -- missing trailer: ", in); |
|
920 - |
|
921 - /* if crc doesn't match, try info-zip variant with sig */ |
|
922 - if (zip_crc != out_check) { |
|
923 - if (zip_crc != 0x08074b50UL || zip_clen != out_check) |
|
924 - bail("corrupted zip entry -- crc32 mismatch: ", in); |
|
925 - zip_crc = zip_clen; |
|
926 - zip_clen = zip_ulen; |
|
927 - zip_ulen = GET4(); |
|
928 - } |
|
929 - |
|
930 - /* handle incredibly rare cases where crc equals signature */ |
|
931 - else if (zip_crc == 0x08074b50UL && zip_clen == zip_crc && |
|
932 - ((clen & LOW32) != zip_crc || zip_ulen == zip_crc)) { |
|
933 - zip_crc = zip_clen; |
|
934 - zip_clen = zip_ulen; |
|
935 - zip_ulen = GET4(); |
|
936 - } |
|
937 - |
|
938 - /* if second length doesn't match, try 64-bit lengths */ |
|
939 - if (zip_ulen != (out_tot & LOW32)) { |
|
940 - zip_ulen = GET4(); |
|
941 - (void)GET4(); |
|
942 - } |
|
943 - if (in_eof) |
|
944 - bail("corrupted zip entry -- missing trailer: ", in); |
|
945 - } |
|
946 - if (zip_clen != (clen & LOW32) || zip_ulen != (out_tot & LOW32)) |
|
947 - bail("corrupted zip entry -- length mismatch: ", in); |
|
948 - check = zip_crc; |
|
949 - } |
|
950 - else if (form == 1) { /* zlib (big-endian) trailer */ |
|
951 - check = (unsigned long)(GET()) << 24; |
|
952 - check += (unsigned long)(GET()) << 16; |
|
953 - check += (unsigned)(GET()) << 8; |
|
954 - check += GET(); |
|
955 - if (in_eof) |
|
956 - bail("corrupted zlib stream -- missing trailer: ", in); |
|
957 - if (check != out_check) |
|
958 - bail("corrupted zlib stream -- adler32 mismatch: ", in); |
|
959 - } |
|
960 - else { /* gzip trailer */ |
|
961 - check = GET4(); |
|
962 - len = GET4(); |
|
963 - if (in_eof) |
|
964 - bail("corrupted gzip stream -- missing trailer: ", in); |
|
965 - if (check != out_check) |
|
966 - bail("corrupted gzip stream -- crc32 mismatch: ", in); |
|
967 - if (len != (out_tot & LOW32)) |
|
968 - bail("corrupted gzip stream -- length mismatch: ", in); |
|
969 - } |
|
970 + check_trailer(check, clen); |
|
971 |
|
972 /* show file information if requested */ |
|
973 if (list) { |
|
974 @@ -2732,6 +3272,231 @@ local void infchk(void) |
|
975 complain("%s OK, has trailing junk which was ignored", in); |
|
976 } |
|
977 |
|
978 +local void uncompress_write_thread(void *dummy) |
|
979 +{ |
|
980 + long seq; /* next sequence number looking for */ |
|
981 + struct job *job; /* job pulled and working on */ |
|
982 + int more; /* true if more chunks to write */ |
|
983 + |
|
984 + (void)dummy; |
|
985 + |
|
986 + seq = 0; |
|
987 + do { |
|
988 + /* get next write job in order */ |
|
989 + possess(write_first); |
|
990 + wait_for(write_first, TO_BE, seq); |
|
991 + job = write_head; |
|
992 + write_head = job->next; |
|
993 + twist(write_first, TO, write_head == NULL ? -1 : write_head->seq); |
|
994 + |
|
995 + /* Checksum has been verified. Accumulate the checksum, write the |
|
996 + output, and free the input and output spaces. While the input space |
|
997 + could be dropped earlier, it is done here to ensure the write queue |
|
998 + doesn't grow without bounds. */ |
|
999 + out_check = COMB(out_check, job->check, job->out->len); |
|
1000 + out_tot += job->out->len; |
|
1001 + |
|
1002 + Trace(("-- writing #%ld", seq)); |
|
1003 + if (decode == 1) /* don't really write if just checking */ |
|
1004 + writen(outd, job->out->buf, job->out->len); |
|
1005 + drop_space(job->in); |
|
1006 + drop_space(job->out); |
|
1007 + Trace(("-- wrote #%ld%s", seq, job->more ? "" : " (last)")); |
|
1008 + |
|
1009 + more = job->more; |
|
1010 + free(job); |
|
1011 + |
|
1012 + seq++; |
|
1013 + } while (more); |
|
1014 + |
|
1015 + /* verify no more jobs, prepare for next use */ |
|
1016 + possess(compress_have); |
|
1017 + assert(compress_head == NULL && peek_lock(compress_have) == 0); |
|
1018 + release(compress_have); |
|
1019 + possess(write_first); |
|
1020 + assert(write_head == NULL); |
|
1021 + twist(write_first, TO, -1); |
|
1022 +} |
|
1023 + |
|
1024 +local void uncompress_thread(void *dummy) |
|
1025 +{ |
|
1026 + struct job *job; /* job pulled and working on */ |
|
1027 + struct job *here, **prior; /* pointers for inserting in write list */ |
|
1028 + unsigned long check; /* check value of output */ |
|
1029 + z_stream strm; /* deflate stream */ |
|
1030 + int err; /* error from inflate() */ |
|
1031 + long firstcheck; /* the initial checksum value */ |
|
1032 + |
|
1033 + (void)dummy; |
|
1034 + |
|
1035 + strm.zfree = Z_NULL; |
|
1036 + strm.zalloc = Z_NULL; |
|
1037 + strm.opaque = Z_NULL; |
|
1038 + if (inflateInit2(&strm, -15) != Z_OK) |
|
1039 + bail("not enough memory", ""); |
|
1040 + |
|
1041 + firstcheck = CHECK(0, Z_NULL, 0); |
|
1042 + |
|
1043 + /* keep looking for work */ |
|
1044 + for (;;) { |
|
1045 + possess(compress_have); |
|
1046 + wait_for(compress_have, NOT_TO_BE, 0); |
|
1047 + job = compress_head; |
|
1048 + assert(job != NULL); |
|
1049 + if (job->seq == -1) |
|
1050 + break; |
|
1051 + compress_head = job->next; |
|
1052 + if (job->next == NULL) |
|
1053 + compress_tail = &compress_head; |
|
1054 + twist(compress_have, BY, -1); |
|
1055 + |
|
1056 + /* got a job -- buffers have all been allocated to the right size. |
|
1057 + deflate and verify the checksum. */ |
|
1058 + Trace(("-- uncompressing #%ld", job->seq)); |
|
1059 + if (inflateReset2(&strm, -15) != Z_OK) |
|
1060 + bail("stream reset failed: ", strm.msg); |
|
1061 + strm.next_in = job->in->buf; |
|
1062 + strm.avail_in = job->in->len; |
|
1063 + strm.next_out = job->out->buf; |
|
1064 + strm.avail_out = job->out->len; |
|
1065 + |
|
1066 + err = inflate(&strm, Z_SYNC_FLUSH); |
|
1067 + if (err != Z_OK && err != Z_STREAM_END) |
|
1068 + bail("corrupted input -- invalid deflate data: ", strm.msg); |
|
1069 + |
|
1070 + /* It's not strictly necessary to verify the checksum here, but it |
|
1071 + seems nice to get an error about a bad checksum as early as possible |
|
1072 + to wasteful cpu and i/o consumtion. */ |
|
1073 + check = CHECK(firstcheck, job->out->buf, job->out->len); |
|
1074 + if (check != job->check) { |
|
1075 + if (form == 1) |
|
1076 + bail("corrupted zlib stream -- adler32 mismatch: ", in); |
|
1077 + else |
|
1078 + bail("corrupted gzip stream -- crc32 mismatch: ", in); |
|
1079 + } |
|
1080 + |
|
1081 + Trace(("-- uncompressed #%ld%s", job->seq, job->more ? "" : " (last)")); |
|
1082 + |
|
1083 + /* insert write job in list in sorted order, alert write thread */ |
|
1084 + possess(write_first); |
|
1085 + prior = &write_head; |
|
1086 + while ((here = *prior) != NULL) { |
|
1087 + if (here->seq > job->seq) |
|
1088 + break; |
|
1089 + prior = &(here->next); |
|
1090 + } |
|
1091 + job->next = here; |
|
1092 + *prior = job; |
|
1093 + twist(write_first, TO, write_head->seq); |
|
1094 + } |
|
1095 + /* found job with seq == -1 -- free inflate memory and return to join */ |
|
1096 + release(compress_have); |
|
1097 + (void)inflateEnd(&strm); |
|
1098 +} |
|
1099 + |
|
1100 +local void parallel_infchk(void) |
|
1101 +{ |
|
1102 + long seq; /* sequence number */ |
|
1103 + struct job *job; /* job of uncompress, then write */ |
|
1104 + struct space *insp; /* space for job input */ |
|
1105 + struct space *outsp; /* space for job output */ |
|
1106 + size_t fromload; |
|
1107 + uint64_t infsz; /* size after inflate() */ |
|
1108 + uint64_t defsz; /* size before inflate() */ |
|
1109 + uint64_t check; /* checksum */ |
|
1110 + int last = 0; /* is this the last block? */ |
|
1111 + |
|
1112 + /* If the index is useless, don't try to use it. */ |
|
1113 + if (!idx.valid) { |
|
1114 + infchk(); |
|
1115 + return; |
|
1116 + } |
|
1117 + |
|
1118 + if (form > 1) { |
|
1119 + complain("index not supported with zip file ", in); |
|
1120 + infchk(); |
|
1121 + return; |
|
1122 + } |
|
1123 + |
|
1124 + /* if first time or after an option change, setup the job lists */ |
|
1125 + setup_jobs(); |
|
1126 + |
|
1127 + /* start write thread */ |
|
1128 + writeth = launch(uncompress_write_thread, NULL); |
|
1129 + |
|
1130 + /* updated by uncompress_write_thread */ |
|
1131 + out_check = CHECK(0L, Z_NULL, 0); |
|
1132 + out_len = 0; |
|
1133 + out_tot = 0; |
|
1134 + |
|
1135 + for (seq = 0; !last; seq++) { |
|
1136 + /* get the next entry from the index */ |
|
1137 + idx_get(&infsz, &defsz, &check, &last); |
|
1138 + |
|
1139 + job = malloc(sizeof(struct job)); |
|
1140 + if (job == NULL) |
|
1141 + bail("not enough memory", ""); |
|
1142 + job->seq = seq; |
|
1143 + job->more = !last; |
|
1144 + job->in = get_space_size(&in_pool, defsz); |
|
1145 + job->out = get_space_size(&out_pool, infsz); |
|
1146 + job->lens = NULL; |
|
1147 + job->check = check; |
|
1148 + job->calc = NULL; |
|
1149 + job->next = NULL; |
|
1150 + |
|
1151 + /* reading the header cached some data, be sure not to skip it */ |
|
1152 + fromload = (in_left < defsz ? in_left : defsz); |
|
1153 + if (fromload > 0) { |
|
1154 + (void)memcpy(job->in->buf, in_next, fromload); |
|
1155 + in_left -= fromload; |
|
1156 + in_next += fromload; |
|
1157 + } |
|
1158 + if (fromload < defsz) |
|
1159 + readn(ind, job->in->buf + fromload, defsz - fromload); |
|
1160 + job->in->len = defsz; |
|
1161 + job->out->len = infsz; |
|
1162 + |
|
1163 + out_len += infsz; |
|
1164 + |
|
1165 + /* start another uncompress thread if needed */ |
|
1166 + if (cthreads <= seq && cthreads < procs) { |
|
1167 + (void)launch(uncompress_thread, NULL); |
|
1168 + cthreads++; |
|
1169 + } |
|
1170 + |
|
1171 + possess(compress_have); |
|
1172 + *compress_tail = job; |
|
1173 + compress_tail = &(job->next); |
|
1174 + twist(compress_have, BY, +1); |
|
1175 + } |
|
1176 + |
|
1177 + /* wait for the write thread to complete (we leave the compress threads out |
|
1178 + there and waiting in case there is another stream to compress) */ |
|
1179 + join(writeth); |
|
1180 + writeth = NULL; |
|
1181 + Trace(("-- write thread joined")); |
|
1182 + |
|
1183 + check_trailer(out_check, out_len); |
|
1184 +} |
|
1185 + |
|
1186 +/* parallel_infchk() or infchk(), whichever works. */ |
|
1187 +local void best_infchk(void) |
|
1188 +{ |
|
1189 + if (index != NULL) { |
|
1190 + /* User specified index file */ |
|
1191 + if (idx_open(index) != 0) |
|
1192 + bail("invalid index file", ""); |
|
1193 + } |
|
1194 + else if (ind_has_index()) |
|
1195 + (void)idx_open("%z"); |
|
1196 + |
|
1197 + if (idx.valid) |
|
1198 + parallel_infchk(); |
|
1199 + else |
|
1200 + infchk(); |
|
1201 +} |
|
1202 + |
|
1203 /* --- decompress Unix compress (LZW) input --- */ |
|
1204 |
|
1205 /* memory for unlzw() -- |
|
1206 @@ -3160,7 +3925,7 @@ local void process(char *path) |
|
1207 /* if requested, test input file (possibly a special list) */ |
|
1208 if (decode == 2) { |
|
1209 if (method == 8) |
|
1210 - infchk(); |
|
1211 + best_infchk(); |
|
1212 else { |
|
1213 unlzw(); |
|
1214 if (list) { |
|
1215 @@ -3220,19 +3985,8 @@ local void process(char *path) |
|
1216 |
|
1217 /* if exists and not -f, give user a chance to overwrite */ |
|
1218 if (outd < 0 && errno == EEXIST && isatty(0) && verbosity) { |
|
1219 - int ch, reply; |
|
1220 - |
|
1221 - fprintf(stderr, "%s exists -- overwrite (y/n)? ", out); |
|
1222 - fflush(stderr); |
|
1223 - reply = -1; |
|
1224 - do { |
|
1225 - ch = getchar(); |
|
1226 - if (reply < 0 && ch != ' ' && ch != '\t') |
|
1227 - reply = ch == 'y' || ch == 'Y' ? 1 : 0; |
|
1228 - } while (ch != EOF && ch != '\n' && ch != '\r'); |
|
1229 - if (reply == 1) |
|
1230 - outd = open(out, O_CREAT | O_TRUNC | O_WRONLY, |
|
1231 - 0600); |
|
1232 + if (allow_overwrite(out)) |
|
1233 + outd = open(out, O_CREAT | O_TRUNC | O_WRONLY, 0600); |
|
1234 } |
|
1235 |
|
1236 /* if exists and no overwrite, report and go on to next */ |
|
1237 @@ -3255,17 +4009,21 @@ local void process(char *path) |
|
1238 /* process ind to outd */ |
|
1239 if (verbosity > 1) |
|
1240 fprintf(stderr, "%s to %s ", in, out); |
|
1241 + |
|
1242 if (decode) { |
|
1243 if (method == 8) |
|
1244 - infchk(); |
|
1245 + best_infchk(); |
|
1246 else if (method == 256) |
|
1247 unlzw(); |
|
1248 else |
|
1249 cat(); |
|
1250 } |
|
1251 #ifndef NOTHREAD |
|
1252 - else if (procs > 1) |
|
1253 + else if (index != NULL) { |
|
1254 + if (idx_open(index) != 0) |
|
1255 + bail("invalid index file", ""); |
|
1256 parallel_compress(); |
|
1257 + } |
|
1258 #endif |
|
1259 else |
|
1260 single_compress(0); |
|
1261 @@ -3274,6 +4032,10 @@ local void process(char *path) |
|
1262 fflush(stderr); |
|
1263 } |
|
1264 |
|
1265 + /* close index file - this may append the index to outd */ |
|
1266 + if (idx.valid) |
|
1267 + idx_close(); |
|
1268 + |
|
1269 /* finish up, copy attributes, set times, delete original */ |
|
1270 if (ind != 0) |
|
1271 close(ind); |
|
1272 @@ -3332,6 +4094,9 @@ local char *helptext[] = { |
|
1273 " -v, --verbose Provide more verbose output", |
|
1274 #endif |
|
1275 " -V --version Show the version of pigz", |
|
1276 +" -X --index file Create or use parallel uncompression index file.", |
|
1277 +" %f and %z are replaced by uncompressed and compressed", |
|
1278 +" file names", |
|
1279 " -z, --zlib Compress to zlib (.zz) instead of gzip format", |
|
1280 " -- All arguments after \"--\" are treated as files" |
|
1281 }; |
|
1282 @@ -3401,11 +4166,11 @@ local void defaults(void) |
|
1283 local char *longopts[][2] = { |
|
1284 {"LZW", "Z"}, {"ascii", "a"}, {"best", "9"}, {"bits", "Z"}, |
|
1285 {"blocksize", "b"}, {"decompress", "d"}, {"fast", "1"}, {"force", "f"}, |
|
1286 - {"help", "h"}, {"independent", "i"}, {"keep", "k"}, {"license", "L"}, |
|
1287 - {"list", "l"}, {"name", "N"}, {"no-name", "n"}, {"no-time", "T"}, |
|
1288 - {"processes", "p"}, {"quiet", "q"}, {"recursive", "r"}, {"rsyncable", "R"}, |
|
1289 - {"silent", "q"}, {"stdout", "c"}, {"suffix", "S"}, {"test", "t"}, |
|
1290 - {"to-stdout", "c"}, {"uncompress", "d"}, {"verbose", "v"}, |
|
1291 + {"help", "h"}, {"independent", "i"}, {"index", "X"}, {"keep", "k"}, |
|
1292 + {"license", "L"}, {"list", "l"}, {"name", "N"}, {"no-name", "n"}, |
|
1293 + {"no-time", "T"}, {"processes", "p"}, {"quiet", "q"}, {"recursive", "r"}, |
|
1294 + {"rsyncable", "R"}, {"silent", "q"}, {"stdout", "c"}, {"suffix", "S"}, |
|
1295 + {"test", "t"}, {"to-stdout", "c"}, {"uncompress", "d"}, {"verbose", "v"}, |
|
1296 {"version", "V"}, {"zip", "K"}, {"zlib", "z"}}; |
|
1297 #define NLOPTS (sizeof(longopts) / (sizeof(char *) << 1)) |
|
1298 |
|
1299 @@ -3445,7 +4210,7 @@ local int option(char *arg) |
|
1300 |
|
1301 /* if no argument or dash option, check status of get */ |
|
1302 if (get && (arg == NULL || *arg == '-')) { |
|
1303 - bad[1] = "bpS"[get - 1]; |
|
1304 + bad[1] = "bpSX"[get - 1]; |
|
1305 bail("missing parameter after ", bad); |
|
1306 } |
|
1307 if (arg == NULL) |
|
1308 @@ -3504,6 +4269,7 @@ local int option(char *arg) |
|
1309 case 'R': rsync = 1; break; |
|
1310 case 'S': get = 3; break; |
|
1311 case 'V': fputs(VERSION, stderr); exit(0); |
|
1312 + case 'X': setdict = 0; get = 4; break; |
|
1313 case 'Z': |
|
1314 bail("invalid option: LZW output not supported: ", bad); |
|
1315 case 'a': |
|
1316 @@ -3531,7 +4297,7 @@ local int option(char *arg) |
|
1317 return 0; |
|
1318 } |
|
1319 |
|
1320 - /* process option parameter for -b, -p, or -S */ |
|
1321 + /* process option parameter for -b, -p, -S, or -X */ |
|
1322 if (get) { |
|
1323 size_t n; |
|
1324 |
|
1325 @@ -3544,7 +4310,7 @@ local int option(char *arg) |
|
1326 OUTPOOL(size) < size || |
|
1327 (ssize_t)OUTPOOL(size) < 0 || |
|
1328 size > (1UL << 22)) |
|
1329 - bail("block size too large: ", arg); |
|
1330 + bail("block size too large:", arg); |
|
1331 new_opts(); |
|
1332 } |
|
1333 else if (get == 2) { |
|
1334 @@ -3562,6 +4328,9 @@ local int option(char *arg) |
|
1335 } |
|
1336 else if (get == 3) |
|
1337 sufx = arg; /* gz suffix */ |
|
1338 + else if (get == 4) |
|
1339 + index = arg; /* index file */ |
|
1340 + |
|
1341 get = 0; |
|
1342 return 0; |
|
1343 } |
|