zlib gzip与zip

在运维与web开发中，我经常听到zlib、gzip和zip这三个名词。常见的使用场景有：编译nginx时需要添加zlib库并启用ngx_gzip_static_module；在php中增加zip模块用来处理zip文件；web服务器启用gzip压缩以节省传输带宽和时间；HTTP/1.1协议中Content-Encoding的gzip和deflate等等。那么这三个东西到底是什么，彼此之间有什么关系？我想很多人都跟我一样，被搞糊涂了。

先来看一下zlib，它的官方网站是www.zlib.net，目前的最新版本是1.2.11。官方介绍它是一个免费的数据压缩库（A Massively Spiffy Yet Delicately Unobtrusive Compression Library）。zlib由两位主要人员开发，Jean-loup Gailly负责压缩功能，Mark Adler负责解压功能。由于代码是开源的，我们不妨把源码下载下来编译一下，编译之后我们在zlib的安装目录看到了下列文件:

[root@iZrj9hu97fjb3e1xlfktg8Z zlib]# tree
.
├── include
│   ├── zconf.h
│   └── zlib.h
├── lib
│   ├── libz.a
│   ├── libz.so -> libz.so.1.2.11
│   ├── libz.so.1 -> libz.so.1.2.11
│   ├── libz.so.1.2.11
│   └── pkgconfig
│       └── zlib.pc
└── share
    └── man
        └── man3
            └── zlib.3

6 directories, 8 files

仔细观察的话，会发现zlib并没有像其它程序一样编译出二进制可执行文件，只有头文件、库文件和文档，也就是说zlib并不提供对文件的直接操作。再回过头来看zlib源代码中的zlib.3.pdf文件，它是这样说的：“zlib库是一个通用的数据压缩库，它提供在内存当中的压缩和解压算法，目前只支持deflation一种算法，将来可能会添加其它压缩算法”，也就是说整个zlib实际上只有一种算法，那就是deflation算法。
zlib提供了几个比较简单的函数来压缩和解压数据:

/*
 * Compress the sourceLen bytes at `source` into the buffer at `dest`
 * using compression level `level`.
 *
 * On entry *destLen holds the capacity of `dest`; on return it holds the
 * number of compressed bytes actually produced (stream.total_out).
 * Everything happens in memory: no file is read or written here.
 *
 * Returns Z_OK when deflate() reports Z_STREAM_END, otherwise the error
 * code returned by deflateInit() or deflate().
 */
int ZEXPORT compress2 (dest, destLen, source, sourceLen, level)
    Bytef *dest;
    uLongf *destLen;
    const Bytef *source;
    uLong sourceLen;
    int level;
{
    z_stream stream;
    int err;
    /* Largest value a uInt can hold; used as the per-call chunk limit. */
    const uInt max = (uInt)-1;
    /* Output capacity that has not yet been handed to the stream. */
    uLong left;

    left = *destLen;
    *destLen = 0;

    /* No custom allocator hooks or opaque pointer are supplied. */
    stream.zalloc = (alloc_func)0;
    stream.zfree = (free_func)0;
    stream.opaque = (voidpf)0;

    err = deflateInit(&stream, level);
    if (err != Z_OK) return err;

    /* Start with empty windows; the loop below fills them on first pass. */
    stream.next_out = dest;
    stream.avail_out = 0;
    stream.next_in = (z_const Bytef *)source;
    stream.avail_in = 0;

    do {
        /* Output window exhausted: expose the next chunk (at most `max`). */
        if (stream.avail_out == 0) {
            stream.avail_out = left > (uLong)max ? max : (uInt)left;
            left -= stream.avail_out;
        }
        /* Input window exhausted: expose the next chunk (at most `max`). */
        if (stream.avail_in == 0) {
            stream.avail_in = sourceLen > (uLong)max ? max : (uInt)sourceLen;
            sourceLen -= stream.avail_in;
        }
        /* Once all input has been handed over (sourceLen == 0), ask
           deflate() to finish the stream; otherwise keep feeding it. */
        err = deflate(&stream, sourceLen ? Z_NO_FLUSH : Z_FINISH);
    } while (err == Z_OK);

    /* Report how many compressed bytes were produced, then release state. */
    *destLen = stream.total_out;
    deflateEnd(&stream);
    return err == Z_STREAM_END ? Z_OK : err;
}

/*
 * Convenience wrapper around compress2() that compresses `source` into
 * `dest` using the Z_DEFAULT_COMPRESSION level. Arguments and return
 * value are passed through to / from compress2() unchanged.
 */
int ZEXPORT compress (dest, destLen, source, sourceLen)
    Bytef *dest;
    uLongf *destLen;
    const Bytef *source;
    uLong sourceLen;
{
    return compress2(dest, destLen, source, sourceLen, Z_DEFAULT_COMPRESSION);
}

通过参数，我们可以看到zlib的压缩确实像上面说的，没有涉及到文件操作，不读取硬盘上的数据文件进行压缩，只操作内存数据。那么我们来测试一下用compress函数压缩数据:

#include <stdio.h>
#include <string.h>
#include <zlib.h>
#include <stdlib.h>
int my_write(char* fname,const char * buffer);
/*
 * Write exactly `len` bytes starting at `data` to the file `fname`
 * (created or truncated, binary mode).
 *
 * The length is passed explicitly because compressed output is binary:
 * it may contain NUL bytes and is not NUL-terminated, so strlen() must
 * not be used to measure it.
 *
 * Returns 0 on success, 1 if the file cannot be opened, the write is
 * short, or closing the file fails.
 */
static int write_bytes(const char *fname, const void *data, size_t len)
{
    FILE *fp = fopen(fname, "wb");
    if (fp == NULL) {
        return 1;
    }
    size_t written = fwrite(data, 1, len, fp);
    /* fclose flushes buffered data, so its result matters for writes. */
    int close_failed = (fclose(fp) != 0);
    return (written == len && !close_failed) ? 0 : 1;
}

/*
 * Demo: compress argv[1] in memory with zlib's compress(), save the
 * compressed bytes to "compressdata.bin", then decompress them into a
 * separate buffer and print the result to show the round trip works.
 *
 * Returns 0 on success, -1 on usage error, allocation failure, or when
 * compression / decompression fails.
 */
int main(int argc,char** argv){
    if(argc == 1){
        printf("Please Input A String You want to Compress\n");
        return -1;
    }
    const char *str = argv[1];
    printf("The String You Want to Compress is: %s\n",str);

    /* Length of the input to compress (terminating NUL excluded). */
    uLong sLen = strlen(str);
    /* Ask zlib for an upper bound on the compressed size so the output
       buffer is always large enough. */
    uLong tLen = compressBound(sLen);

    /* Buffer for the compressed bytes (raw binary, hence unsigned char). */
    unsigned char *cspace = malloc(tLen);
    if(cspace == NULL){
        printf("Not enough memory!\n");
        return -1;
    }

    /* Compress; on success tLen is updated to the actual compressed size. */
    int result = compress(cspace,&tLen,(const unsigned char *)str,sLen);
    if(result != Z_OK){
        /* Nothing valid to save or decompress. */
        printf("compress failure!\n");
        free(cspace);
        return -1;
    }
    printf("Compress Sucess!\n");

    /* Save the compressed bytes to disk using their real length. */
    if(write_bytes("compressdata.bin",cspace,(size_t)tLen) == 0){
        printf("\t-Sucess to write into disk!\n");
    }else{
        printf("\t-Failure to write into disk!\n");
    }

    /* Decompress into a separate buffer (not back over the input), so the
       printed text really comes from the compressed data. One extra byte
       is reserved for the terminating NUL needed by printf("%s"). */
    char *dspace = malloc((size_t)sLen + 1);
    if(dspace == NULL){
        printf("Not enough memory!\n");
        free(cspace);
        return -1;
    }
    uLong dLen = sLen;
    result = uncompress((unsigned char *)dspace,&dLen,cspace,tLen);
    if(result == Z_OK){
        dspace[dLen] = '\0';
        printf("Original String is: %s\n",dspace);
    }else{
        printf("uncompress failure!\n");
    }

    free(dspace);
    free(cspace);
    return result == Z_OK ? 0 : -1;
}
/*
 * Write the NUL-terminated string `buffer` to the file `fname`
 * (created or truncated, binary mode).
 *
 * The number of bytes written is strlen(buffer), so this function is only
 * suitable for text: binary data that contains NUL bytes or lacks a
 * terminating NUL must not be passed here.
 *
 * Returns 0 when a non-empty string was written completely and the file
 * was closed without error; returns 1 otherwise (file cannot be opened,
 * empty string, short write, or close failure).
 */
int my_write(char* fname,const char * buffer)
{
    FILE *fp = fopen(fname,"wb");
    if (fp == NULL) {
        /* Cannot open the target; report failure instead of passing a
           null stream to fwrite/fclose. */
        return 1;
    }

    /* Measure once and reuse for both the write and the result check. */
    size_t len = strlen(buffer);

    /* fwrite returns the number of complete items written; the whole
       string is written as a single item of `len` bytes. */
    size_t items = fwrite(buffer,len,1,fp);

    /* fclose flushes buffered data, so a failure here means data loss. */
    int close_failed = (fclose(fp) != 0);

    if (len > 0 && items == 1 && !close_failed) {
        return 0;
    }
    return 1;
}

编译运行:

[root@iZrj9hu97fjb3e1xlfktg8Z ~]# gcc zcompress.c  -I /tmp/zlib/include/ -lz -L /tmp/zlib/lib/ -o z
[root@iZrj9hu97fjb3e1xlfktg8Z ~]# ./z "This is the string"
The String You Want to Compress is: This is the string
Compress Sucess!
	-Sucess to write into disk!
Original String is: This is the string

到这里一切正常，那是不是zlib提供的就只是deflation算法呢？实则不然，zlib还提供了一种zlib数据格式，在原有压缩数据的基础上，添加头部和尾部信息。在HTTP/1.1协议中，Content-Encoding类型中的deflate，实际上不是deflation算法处理后的原始数据，而是添加了头部和尾部信息的ZLIB格式。ZLIB的头部和尾部数据信息可以查看RFC 1950。

那么gzip和zlib又有什么区别呢，在gzip的说明文档中，看到了这样的文字:
The deflation algorithm used by zip and gzip is a variation of LZ77
(Lempel-Ziv 1977, see reference below). It finds duplicated strings in
the input data. The second occurrence of a string is replaced by a
pointer to the previous string, in the form of a pair (distance,
length). Distances are limited to 32K bytes, and lengths are limited
to 258 bytes. When a string does not occur anywhere in the previous
32K bytes, it is emitted as a sequence of literal bytes. (In this
description, ‘string’ must be taken as an arbitrary sequence of bytes,
and is not restricted to printable characters.)

…
2. gzip file format

The pkzip format imposes a lot of overhead in various headers, which
are useful for an archiver but not necessary when only one file is
compressed. gzip uses a much simpler structure. Numbers are in little
endian format, and bit 0 is the least significant bit.
A gzip file is a sequence of compressed members. Each member has the
following structure:

2 bytes magic header 0x1f, 0x8b (\037 \213)
1 byte compression method (0..7 reserved, 8 = deflate)
1 byte flags
bit 0 set: file probably ascii text
bit 1 set: continuation of multi-part gzip file
bit 2 set: extra field present
bit 3 set: original file name present
bit 4 set: file comment present
bit 5 set: file is encrypted
bit 6,7: reserved
4 bytes file modification time in Unix format
1 byte extra flags (depend on compression method)
1 byte operating system on which compression took place

2 bytes optional part number (second part=1)
2 bytes optional extra field length
? bytes optional extra field
? bytes optional original file name, zero terminated
? bytes optional file comment, zero terminated
12 bytes optional encryption header
? bytes compressed data
4 bytes crc32
4 bytes uncompressed input size modulo 2^32

也就是说gzip也是使用deflation算法进行数据的压缩，只是存储格式不一样；另外gzip只能处理单个文件，而zip格式则可以处理多个文件和目录。

综上:
1，zlib提供一系列函数库，可以采用deflation算法对数据进行压缩，还提供ZLIB DATA FORMAT的跨平台的数据格式。
2，gzip是一个采用deflation算法处理文件的工具，提供比较小的头部和尾部信息，只支持处理单个文件。
3，zip的压缩算法也是采用deflation，它能够处理多个文件或者目录。
4，HTTP协议中的内容编码（Content-Encoding）deflate，传输的并不是deflation算法输出的原始压缩数据，而是zlib格式的数据，即在deflation压缩数据的基础上，增加了zlib头部和尾部数据。

参考资料

发表回复