zlib gzip与zip

 分类: 致知

在运维与web开发中,我经常听说这三个名词,常用的使用环境有编译nginx需要添加zlib库,启用ngx_gzip_static_module,php中增加个zip模块可以用来处理zip文件,web服务器启用gzip压缩可以节省传输宽带和时间,HTTP/1.1协议中Content-Encoding的gzip和defalte等等等,那么这三个东西到底是什么,有什么关系,我想很多人都跟我一样,被搞糊涂了。

先来看一下zlib,它的官方网站是www.zlib.net,目前的最新版本是1.2.11.官方介绍它是一个免费的数据压缩库(A Massively Spiffy Yet Delicately Unobtrusive Compression Library),zlib由两位主要人员开发,Jean-loup Gailly负责压缩功能,Mark Adler负责解压功能。由于代码是开源的,我们不妨把源码下载下来,编译一下,编译之后我们在zlib的安装目录看到了下列文件:

[root@iZrj9hu97fjb3e1xlfktg8Z zlib]# tree
.
├── include
│   ├── zconf.h
│   └── zlib.h
├── lib
│   ├── libz.a
│   ├── libz.so -> libz.so.1.2.11
│   ├── libz.so.1 -> libz.so.1.2.11
│   ├── libz.so.1.2.11
│   └── pkgconfig
│       └── zlib.pc
└── share
    └── man
        └── man3
            └── zlib.3

6 directories, 8 files

仔细观察的话,发现zlib并没有像其它的程序一样,编译出二进制可指定代码,只有头文件,库文件和文档,也就是说zlib并不提供对文件的直接操作。再来会过来头看zlib源代码的zlib.3.pdf文件,它是这样说的:“zlib库是一个通用的数据压缩库,它提供在内存当中的压缩和解压算法,目前只支持deflation一种算法,将来可能会添加其它压缩算法”,也就是说整个zlib实际上只有一种算法,那就是deflation算法。
zlib提供了几个比较简单的函数来压缩和解压数据:

int ZEXPORT compress2 (dest, destLen, source, sourceLen, level)
    Bytef *dest;
    uLongf *destLen;
    const Bytef *source;
    uLong sourceLen;
    int level;
{
    z_stream stream;
    int err;
    const uInt max = (uInt)-1;
    uLong left;

    left = *destLen;
    *destLen = 0;

    stream.zalloc = (alloc_func)0;
    stream.zfree = (free_func)0;
    stream.opaque = (voidpf)0;

    err = deflateInit(&stream, level);
    if (err != Z_OK) return err;

    stream.next_out = dest;
    stream.avail_out = 0;
    stream.next_in = (z_const Bytef *)source;
    stream.avail_in = 0;

    do {
        if (stream.avail_out == 0) {
            stream.avail_out = left > (uLong)max ? max : (uInt)left;
            left -= stream.avail_out;
        }
        if (stream.avail_in == 0) {
            stream.avail_in = sourceLen > (uLong)max ? max : (uInt)sourceLen;
            sourceLen -= stream.avail_in;
        }
        err = deflate(&stream, sourceLen ? Z_NO_FLUSH : Z_FINISH);
    } while (err == Z_OK);

    *destLen = stream.total_out;
    deflateEnd(&stream);
    return err == Z_STREAM_END ? Z_OK : err;
}

int ZEXPORT compress (dest, destLen, source, sourceLen)
    Bytef *dest;
    uLongf *destLen;
    const Bytef *source;
    uLong sourceLen;
{
    return compress2(dest, destLen, source, sourceLen, Z_DEFAULT_COMPRESSION);
}

通过参数,我们可以看到zlib的压缩确实像上面说的,没有涉及到文件操作,不读取硬盘上的数据文件进行压缩,只操作内存数据。那么我们来测试一下用compress函数压缩数据:

#include <stdio.h>
#include <string.h>
#include <zlib.h>
#include <stdlib.h>
int my_write(char* fname,const char * buffer);
int main(int argc,char** argv){
    if(argc == 1){
        printf("Please Input A String You want to Compress\n");
        return -1;
    }
    char * str = argv[1];
    printf("The String You Want to Compress is: %s\n",str);
    //计算需要压缩的字符的长度
    uLong sLen = strlen(str);
    //压缩之后字符的长度
    uLong tLen;
    //通过zlib API计算tlen的长度,以便接下来给压缩之后的数据分配空间
    tLen = compressBound(sLen);
    //分配压缩数据空间
    char * cspace;
    cspace = malloc(tLen);
    //是否分配成功
    if(cspace == NULL){
        printf("Not enough memory!\n");
        return -1;
    }
    //开始压缩
    int result;
    result = compress(cspace,&tLen,str,sLen);
    if(result == Z_OK){
        printf("Compress Sucess!\n");    
        //保存压缩内容到硬盘
        int j = my_write("compressdata.bin",cspace);
        if(j==0){
            printf("\t-Sucess to write into disk!\n");
        }else{
            printf("\t-Failure to write into disk!\n");
        }
    }
    //开始解压
    result = uncompress(str,&sLen,cspace,tLen);
    if(result == Z_OK){
        printf("Original String is: %s\n",str);
    }else{
        printf("uncompress failure!\n");
    }
    free(cspace);
    return 0;

}
int my_write(char* fname,const char * buffer)
{
    size_t writesize;
    FILE *pFile;
    pFile = fopen(fname,"wb");
    //fwrite返回区块数量
    writesize = fwrite(buffer,strlen(buffer),1,pFile);
    fclose(pFile);
    if(strlen(buffer)>0 && 1 == writesize){
        return 0;
    }else{
        return 1;
    }
}

编译运行:

[root@iZrj9hu97fjb3e1xlfktg8Z ~]# gcc zcompress.c  -I /tmp/zlib/include/ -lz -L /tmp/zlib/lib/ -o z
[root@iZrj9hu97fjb3e1xlfktg8Z ~]# ./z "This is the string"
The String You Want to Compress is: This is the string
Compress Sucess!
	-Sucess to write into disk!
Original String is: This is the string

到这里一切正常,那是不是zlib提供就是deflation算法呢?实则不然,zlib还提供了一种zlib数据格式,在原有压缩数据的基础上,添加头部和尾部信息,在HTTP/1.1协议中,content-encoding类型中的deflate,实际上不是deflation算法处理的原始数据,而是添加了头部和尾部信息的ZLIB格式。ZLIB的头部和尾部数据信息可以查看rfc1950。

那么gzip和zlib又有什么区别呢,在gzip的说明文档中,看到了这样的文字:
The deflation algorithm used by zip and gzip is a variation of LZ77
(Lempel-Ziv 1977, see reference below). It finds duplicated strings in
the input data. The second occurrence of a string is replaced by a
pointer to the previous string, in the form of a pair (distance,
length). Distances are limited to 32K bytes, and lengths are limited
to 258 bytes. When a string does not occur anywhere in the previous
32K bytes, it is emitted as a sequence of literal bytes. (In this
description, ‘string’ must be taken as an arbitrary sequence of bytes,
and is not restricted to printable characters.)


2. gzip file format

The pkzip format imposes a lot of overhead in various headers, which
are useful for an archiver but not necessary when only one file is
compressed. gzip uses a much simpler structure. Numbers are in little
endian format, and bit 0 is the least significant bit.
A gzip file is a sequence of compressed members. Each member has the
following structure:

2 bytes magic header 0x1f, 0x8b (\037 \213)
1 byte compression method (0..7 reserved, 8 = deflate)
1 byte flags
bit 0 set: file probably ascii text
bit 1 set: continuation of multi-part gzip file
bit 2 set: extra field present
bit 3 set: original file name present
bit 4 set: file comment present
bit 5 set: file is encrypted
bit 6,7: reserved
4 bytes file modification time in Unix format
1 byte extra flags (depend on compression method)
1 byte operating system on which compression took place

2 bytes optional part number (second part=1)
2 bytes optional extra field length
? bytes optional extra field
? bytes optional original file name, zero terminated
? bytes optional file comment, zero terminated
12 bytes optional encryption header
? bytes compressed data
4 bytes crc32
4 bytes uncompressed input size modulo 2^32

也就是说gzip也是使用deflation算法进行数据的压缩,只是存储格式和不一样,另外gzip只能处理当个文件,然zip格式则可以处理目录。

综上:
1,zlib提供一系列函数库,可以采用deflation算法对数据进行压缩,还提供ZLIB DATA FORMAT的跨平台的数据格式。
2,gzip是一个采用deflation算法处理文件的工具,提供比较小的头部和尾部信息,只支持处理单个文件。
3,zip的压缩算法也是采用deflation,它能够处理多个文件或者目录。
4,HTTP协议中的传输编码deflate,传输的数据不是经过deflation压缩的数据,而是zlib格式的数据,即在压缩的基础上,增加zlib头和尾数据。

参考资料

  1. Zlib
  2. http://www.zlib.net/zlib_faq.html

发表评论