svn commit: r194579 - head/usr.bin/gzip

Xin LI delphij at FreeBSD.org
Sun Jun 21 09:39:44 UTC 2009


Author: delphij
Date: Sun Jun 21 09:39:43 2009
New Revision: 194579
URL: http://svn.freebsd.org/changeset/base/194579

Log:
  Add support for uncompressing pack(1)'ed files.  Pack(1) is a program found
  in some commercial Unix systems, which utilizes Huffman minimum redundancy
  code tree to compress files.  This implementation supports the "new" pack
  format only, just like GNU gzip did.
  
  Thanks for oliver@'s archive set which I can test against, and Mingyan Guo
  for providing helpful review of my code.
  
  PR:		bin/109567
  MFC after:	1 month

Added:
  head/usr.bin/gzip/unpack.c   (contents, props changed)
Modified:
  head/usr.bin/gzip/gzip.c

Modified: head/usr.bin/gzip/gzip.c
==============================================================================
--- head/usr.bin/gzip/gzip.c	Sun Jun 21 09:01:12 2009	(r194578)
+++ head/usr.bin/gzip/gzip.c	Sun Jun 21 09:39:43 2009	(r194579)
@@ -79,6 +79,9 @@ enum filetype {
 #ifndef NO_COMPRESS_SUPPORT
 	FT_Z,
 #endif
+#ifndef NO_PACK_SUPPORT
+	FT_PACK,
+#endif
 	FT_LAST,
 	FT_UNKNOWN
 };
@@ -95,6 +98,10 @@ enum filetype {
 #define Z_MAGIC		"\037\235"
 #endif
 
+#ifndef NO_PACK_SUPPORT
+#define PACK_MAGIC	"\037\036"
+#endif
+
 #define GZ_SUFFIX	".gz"
 
 #define BUFLEN		(64 * 1024)
@@ -143,7 +150,7 @@ static suffixes_t suffixes[] = {
 };
 #define NUM_SUFFIXES (sizeof suffixes / sizeof suffixes[0])
 
-static	const char	gzip_version[] = "FreeBSD gzip 20070711";
+static	const char	gzip_version[] = "FreeBSD gzip 20090621";
 
 #ifndef SMALL
 static	const char	gzip_copyright[] = \
@@ -248,6 +255,10 @@ static	FILE 	*zdopen(int);
 static	off_t	zuncompress(FILE *, FILE *, char *, size_t, off_t *);
 #endif
 
+#ifndef NO_PACK_SUPPORT
+static	off_t	unpack(int, int, char *, size_t, off_t *);
+#endif
+
 int main(int, char **p);
 
 #ifdef SMALL
@@ -1104,6 +1115,11 @@ file_gettype(u_char *buf)
 		return FT_Z;
 	else
 #endif
+#ifndef NO_PACK_SUPPORT
+	if (memcmp(buf, PACK_MAGIC, 2) == 0)
+		return FT_PACK;
+	else
+#endif
 		return FT_UNKNOWN;
 }
 
@@ -1458,6 +1474,17 @@ file_uncompress(char *file, char *outfil
 	} else
 #endif
 
+#ifndef NO_PACK_SUPPORT
+	if (method == FT_PACK) {
+		if (lflag) {
+			maybe_warnx("no -l with packed files");
+			goto lose;
+		}
+
+		size = unpack(fd, zfd, NULL, 0, NULL);
+	} else
+#endif
+
 #ifndef SMALL
 	if (method == FT_UNKNOWN) {
 		if (lflag) {
@@ -1651,6 +1678,12 @@ handle_stdin(void)
 		fclose(in);
 		break;
 #endif
+#ifndef NO_PACK_SUPPORT
+	case FT_PACK:
+		usize = unpack(STDIN_FILENO, STDOUT_FILENO,
+			       (char *)header1, sizeof header1, &gsize);
+		break;
+#endif
 	}
 
 #ifndef SMALL
@@ -2038,6 +2071,9 @@ display_version(void)
 #ifndef NO_COMPRESS_SUPPORT
 #include "zuncompress.c"
 #endif
+#ifndef NO_PACK_SUPPORT
+#include "unpack.c"
+#endif
 
 static ssize_t
 read_retry(int fd, void *buf, size_t sz)

Added: head/usr.bin/gzip/unpack.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/usr.bin/gzip/unpack.c	Sun Jun 21 09:39:43 2009	(r194579)
@@ -0,0 +1,322 @@
+/*-
+ * Copyright (c) 2009 Xin LI <delphij at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/* This file is #included by gzip.c */
+
+/*
+ * pack(1) file format:
+ *
+ * The first 7 bytes is the header:
+ *	00, 01 - Signature (US, RS), we already validated it earlier.
+ *	02..05 - Uncompressed size
+ *	    06 - Level for the huffman tree (<=24)
+ *
+ * pack(1) will then store symbols (leaf) nodes count in each huffman
+ * tree levels, each level would consume 1 byte (See [1]).
+ *
+ * After the symbol count table, there is the symbol table, storing
+ * symbols represented by coresponding leaf node.  EOB is not being
+ * explicitly transmitted (not necessary anyway) in the symbol table.
+ *
+ * Compressed data goes after the symbol table.
+ *
+ * NOTES
+ *
+ * [1] If we count EOB into the symbols, that would mean that we will
+ * have at most 256 symbols in the huffman tree.  pack(1) rejects empty
+ * file and files that just repeats one character, which means that we
+ * will have at least 2 symbols.  Therefore, pack(1) would reduce the
+ * last level symbol count by 2 which makes it a number in
+ * range [0..254], so all levels' symbol count would fit into 1 byte.
+ */
+
+#define	PACK_HEADER_LENGTH	7
+#define	HTREE_MAXLEVEL		24
+
+/*
+ * unpack descriptor
+ *
+ * Represent the huffman tree in a similiar way that pack(1) would
+ * store in a packed file.  We store all symbols in a linear table,
+ * and store pointers to each level's first symbol.  In addition to
+ * that, maintain two counts for each level: inner nodes count and
+ * leaf nodes count.
+ */
+typedef struct {
+	int		symbol_size;	/* Size of the symbol table */
+	int		treelevels;	/* Levels for the huffman tree */
+
+	int		*symbolsin;	/* Table of leaf symbols count in
+					   each level */
+	int		*inodesin;	/* Table of internal nodes count in
+					   each level */
+
+	char		*symbol;	/* The symbol table */
+	char		*symbol_eob;	/* Pointer to the EOB symbol */
+	char		**tree;		/* Decoding huffman tree (pointers to
+					   first symbol of each tree level */
+
+	off_t		uncompressed_size; /* Uncompressed size */
+	FILE		*fpIn;		/* Input stream */
+	FILE		*fpOut;		/* Output stream */
+} unpack_descriptor_t;
+
+/*
+ * Release resource allocated to an unpack descriptor.
+ *
+ * Caller is responsible to make sure that all of these pointers are
+ * initialized (in our case, they all point to valid memory block).
+ * We don't zero out pointers here because nobody else would ever
+ * reference the memory block without scrubing them.
+ */
+static void
+unpack_descriptor_fini(unpack_descriptor_t *unpackd)
+{
+
+	free(unpackd->symbolsin);
+	free(unpackd->inodesin);
+	free(unpackd->symbol);
+	free(unpackd->tree);
+
+	fclose(unpackd->fpIn);
+	fclose(unpackd->fpOut);
+}
+
+/*
+ * Recursively fill the internal node count table
+ */
+static void
+unpackd_fill_inodesin(const unpack_descriptor_t *unpackd, int level)
+{
+
+	/*
+	 * The internal nodes would be 1/2 of total internal nodes and
+	 * leaf nodes in the next level.  For the last level there
+	 * would be no internal node by defination.
+	 */
+	if (level < unpackd->treelevels) {
+		unpackd_fill_inodesin(unpackd, level + 1);
+		unpackd->inodesin[level] = (unpackd->inodesin[level + 1] +
+					  unpackd->symbolsin[level + 1]) / 2;
+	} else
+		unpackd->inodesin[level] = 0;
+}
+
+/*
+ * Update counter for accepted bytes
+ */
+static void
+accepted_bytes(off_t *bytes_in, off_t newbytes)
+{
+
+	if (bytes_in != NULL)
+		(*bytes_in) += newbytes;
+}
+
+/*
+ * Read file header and construct the tree.  Also, prepare the buffered I/O
+ * for decode rountine.
+ *
+ * Return value is uncompressed size.
+ */
+static void
+unpack_parse_header(int in, int out, char *pre, size_t prelen, off_t *bytes_in,
+    unpack_descriptor_t *unpackd)
+{
+	unsigned char hdr[PACK_HEADER_LENGTH];	/* buffer for header */
+	ssize_t bytesread;		/* Bytes read from the file */
+	int i, j, thisbyte;
+
+	/* Prepend the header buffer if we already read some data */
+	if (prelen != 0)
+		memcpy(hdr, pre, prelen);
+
+	/* Read in and fill the rest bytes of header */
+	bytesread = read(in, hdr + prelen, PACK_HEADER_LENGTH - prelen);
+	if (bytesread < 0)
+		maybe_err("Error reading pack header");
+
+	accepted_bytes(bytes_in, PACK_HEADER_LENGTH);
+
+	/* Obtain uncompressed length (bytes 2,3,4,5)*/
+	unpackd->uncompressed_size = 0;
+	for (i = 2; i <= 5; i++) {
+		unpackd->uncompressed_size <<= 8;
+		unpackd->uncompressed_size |= hdr[i];
+	}
+
+	/* Get the levels of the tree */
+	unpackd->treelevels = hdr[6];
+	if (unpackd->treelevels > HTREE_MAXLEVEL || unpackd->treelevels < 1)
+		maybe_errx("Huffman tree has insane levels");
+
+	/* Let libc take care for buffering from now on */
+	if ((unpackd->fpIn = fdopen(in, "r")) == NULL)
+		maybe_err("Can not fdopen() input stream");
+	if ((unpackd->fpOut = fdopen(out, "w")) == NULL)
+		maybe_err("Can not fdopen() output stream");
+
+	/* Allocate for the tables of bounds and the tree itself */
+	unpackd->inodesin =
+	    calloc(unpackd->treelevels, sizeof(*(unpackd->inodesin)));
+	unpackd->symbolsin =
+	    calloc(unpackd->treelevels, sizeof(*(unpackd->symbolsin)));
+	unpackd->tree =
+	    calloc(unpackd->treelevels, (sizeof (*(unpackd->tree))));
+	if (unpackd->inodesin == NULL || unpackd->symbolsin == NULL ||
+	    unpackd->tree == NULL)
+		maybe_err("calloc");
+
+	/* We count from 0 so adjust to match array upper bound */
+	unpackd->treelevels--;
+
+	/* Read the levels symbol count table and caculate total */
+	unpackd->symbol_size = 1;		/* EOB */
+	for (i = 0; i <= unpackd->treelevels; i++) {
+		if ((thisbyte = fgetc(unpackd->fpIn)) == EOF)
+			maybe_err("File appears to be truncated");
+		unpackd->symbolsin[i] = (unsigned char)thisbyte;
+		unpackd->symbol_size += unpackd->symbolsin[i];
+	}
+	accepted_bytes(bytes_in, unpackd->treelevels);
+	if (unpackd->symbol_size > 256)
+		maybe_errx("Bad symbol table");
+
+	/* Allocate for the symbol table, point symbol_eob at the beginning */
+	unpackd->symbol_eob = unpackd->symbol = calloc(1, unpackd->symbol_size);
+	if (unpackd->symbol == NULL)
+		maybe_err("calloc");
+
+	/*
+	 * Read in the symbol table, which contain [2, 256] symbols.
+	 * In order to fit the count in one byte, pack(1) would offset
+	 * it by reducing 2 from the actual number from the last level.
+	 *
+	 * We adjust the last level's symbol count by 1 here, because
+	 * the EOB symbol is not being transmitted explicitly.  Another
+	 * adjustment would be done later afterward.
+	 */
+	unpackd->symbolsin[unpackd->treelevels]++;
+	for (i = 0; i <= unpackd->treelevels; i++) {
+		unpackd->tree[i] = unpackd->symbol_eob;
+		for (j = 0; j < unpackd->symbolsin[i]; j++) {
+			if ((thisbyte = fgetc(unpackd->fpIn)) == EOF)
+				maybe_errx("Symbol table truncated");
+			*unpackd->symbol_eob++ = (char)thisbyte;
+		}
+		accepted_bytes(bytes_in, unpackd->symbolsin[i]);
+	}
+
+	/* Now, take account for the EOB symbol as well */
+	unpackd->symbolsin[unpackd->treelevels]++;
+
+	/*
+	 * The symbolsin table has been constructed now.
+	 * Caculate the internal nodes count table based on it.
+	 */
+	unpackd_fill_inodesin(unpackd, 0);
+}
+
+/*
+ * Decode huffman stream, based on the huffman tree.
+ */
+static void
+unpack_decode(const unpack_descriptor_t *unpackd, off_t *bytes_in)
+{
+	int thislevel, thiscode, thisbyte, inlevelindex;
+	int i;
+	off_t bytes_out = 0;
+	const char *thissymbol;	/* The symbol pointer decoded from stream */
+
+	/*
+	 * Decode huffman.  Fetch every bytes from the file, get it
+	 * into 'thiscode' bit-by-bit, then output the symbol we got
+	 * when one has been found.
+	 *
+	 * Assumption: sizeof(int) > ((max tree levels + 1) / 8).
+	 * bad things could happen if not.
+	 */
+	thislevel = 0;
+	thiscode = thisbyte = 0;
+
+	while ((thisbyte = fgetc(unpackd->fpIn)) != EOF) {
+		accepted_bytes(bytes_in, 1);
+
+		/*
+		 * Split one bit from thisbyte, from highest to lowest,
+		 * feed the bit into thiscode, until we got a symbol from
+		 * the tree.
+		 */
+		for (i = 7; i >= 0; i--) {
+			thiscode = (thiscode << 1) | ((thisbyte >> i) & 1);
+
+			/* Did we got a symbol? (referencing leaf node) */
+			if (thiscode >= unpackd->inodesin[thislevel]) {
+				inlevelindex =
+				    thiscode - unpackd->inodesin[thislevel];
+				if (inlevelindex > unpackd->symbolsin[thislevel])
+					maybe_errx("File corrupt");
+
+				thissymbol =
+				    &(unpackd->tree[thislevel][inlevelindex]);
+				if ((thissymbol == unpackd->symbol_eob) &&
+				    (bytes_out == unpackd->uncompressed_size))
+					goto finished;
+
+				fputc((*thissymbol), unpackd->fpOut);
+				bytes_out++;
+
+				/* Prepare for next input */
+				thislevel = 0; thiscode = 0;
+			} else {
+				thislevel++;
+				if (thislevel > unpackd->treelevels)
+					maybe_errx("File corrupt");
+			}
+		}
+	}
+
+finished:
+	if (bytes_out != unpackd->uncompressed_size)
+		maybe_errx("Premature EOF");
+}
+
+/* Handler for pack(1)'ed file */
+static off_t
+unpack(int in, int out, char *pre, size_t prelen, off_t *bytes_in)
+{
+	unpack_descriptor_t	unpackd;
+
+	unpack_parse_header(dup(in), dup(out), pre, prelen, bytes_in, &unpackd);
+	unpack_decode(&unpackd, bytes_in);
+	unpack_descriptor_fini(&unpackd);
+
+	/* If we reached here, the unpack was successful */
+	return (unpackd.uncompressed_size);
+}
+


More information about the svn-src-head mailing list