diff --git a/README.md b/README.md index 37c4a43..95cf49e 100644 --- a/README.md +++ b/README.md @@ -33,13 +33,19 @@ g++ -o mycode mycode.cpp -L/path/to/install/dir -lcnpy -lz --std=c++11 # Description: -There are two functions for writing data: `npy_save` and `npz_save`. +There are three functions for writing data: `npy_save`, `npz_save`, and `npz_save_compressed`. + +- `npy_save` saves a single array to a .npy file. +- `npz_save` saves arrays to a .npz file. It accepts an optional `compress` parameter (default: false) to enable zlib deflate compression. +- `npz_save_compressed` is a convenience wrapper that calls `npz_save` with compression enabled (equivalent to `numpy.savez_compressed`). There are 3 functions for reading: - `npy_load` will load a .npy file. -- `npz_load(fname)` will load a .npz and return a dictionary of NpyArray structues. +- `npz_load(fname)` will load a .npz and return a dictionary of NpyArray structures. - `npz_load(fname,varname)` will load and return the NpyArray for data varname from the specified .npz file. +Both compressed and uncompressed .npz files are supported for reading, including files created by `numpy.savez_compressed()`, which use the ZIP64 format. + The data structure for loaded data is below. Data is accessed via the `data()`-method, which returns a pointer of the specified type (which must match the underlying datatype of the data). The array shape and word size are read from the npy header. 
diff --git a/cnpy.cpp b/cnpy.cpp index 2d28578..44b589a 100644 --- a/cnpy.cpp +++ b/cnpy.cpp @@ -188,6 +188,24 @@ cnpy::NpyArray load_the_npy_file(FILE* fp) { return arr; } +// Helper function to parse ZIP64 extended info from extra field +void parse_zip64_sizes(const std::vector& extra_field, uint32_t& compr_bytes, uint32_t& uncompr_bytes) { + if(extra_field.size() >= 4 && (compr_bytes == 0xFFFFFFFF || uncompr_bytes == 0xFFFFFFFF)) { + uint16_t extra_id = *reinterpret_cast(&extra_field[0]); + uint16_t extra_size = *reinterpret_cast(&extra_field[2]); + if(extra_id == 0x0001 && extra_size >= 16) { // ZIP64 extended info + size_t offset = 4; + if(uncompr_bytes == 0xFFFFFFFF) { + uncompr_bytes = static_cast(*reinterpret_cast(&extra_field[offset])); + offset += 8; + } + if(compr_bytes == 0xFFFFFFFF) { + compr_bytes = static_cast(*reinterpret_cast(&extra_field[offset])); + } + } + } +} + cnpy::NpyArray load_the_npz_array(FILE* fp, uint32_t compr_bytes, uint32_t uncompr_bytes) { std::vector buffer_compr(compr_bytes); @@ -257,9 +275,9 @@ cnpy::npz_t cnpy::npz_load(std::string fname) { //read in the extra field uint16_t extra_field_len = *(uint16_t*) &local_header[28]; + std::vector extra_field(extra_field_len); if(extra_field_len > 0) { - std::vector buff(extra_field_len); - size_t efield_res = fread(&buff[0],sizeof(char),extra_field_len,fp); + size_t efield_res = fread(&extra_field[0],sizeof(char),extra_field_len,fp); if(efield_res != extra_field_len) throw std::runtime_error("npz_load: failed fread"); } @@ -268,6 +286,9 @@ cnpy::npz_t cnpy::npz_load(std::string fname) { uint32_t compr_bytes = *reinterpret_cast(&local_header[0]+18); uint32_t uncompr_bytes = *reinterpret_cast(&local_header[0]+22); + // ZIP64 support: if sizes are 0xFFFFFFFF, read from extra field + parse_zip64_sizes(extra_field, compr_bytes, uncompr_bytes); + if(compr_method == 0) {arrays[varname] = load_the_npy_file(fp);} else {arrays[varname] = load_the_npz_array(fp,compr_bytes,uncompr_bytes);} } @@ 
-300,21 +321,28 @@ cnpy::NpyArray cnpy::npz_load(std::string fname, std::string varname) { //read in the extra field uint16_t extra_field_len = *(uint16_t*) &local_header[28]; - fseek(fp,extra_field_len,SEEK_CUR); //skip past the extra field + std::vector extra_field(extra_field_len); + if(extra_field_len > 0) { + size_t efield_res = fread(&extra_field[0],sizeof(char),extra_field_len,fp); + if(efield_res != extra_field_len) + throw std::runtime_error("npz_load: failed fread"); + } uint16_t compr_method = *reinterpret_cast(&local_header[0]+8); uint32_t compr_bytes = *reinterpret_cast(&local_header[0]+18); uint32_t uncompr_bytes = *reinterpret_cast(&local_header[0]+22); + // ZIP64 support: if sizes are 0xFFFFFFFF, read from extra field + parse_zip64_sizes(extra_field, compr_bytes, uncompr_bytes); + if(vname == varname) { NpyArray array = (compr_method == 0) ? load_the_npy_file(fp) : load_the_npz_array(fp,compr_bytes,uncompr_bytes); fclose(fp); return array; } else { - //skip past the data - uint32_t size = *(uint32_t*) &local_header[22]; - fseek(fp,size,SEEK_CUR); + //skip past the data (use compr_bytes for compressed data) + fseek(fp,compr_bytes,SEEK_CUR); } } diff --git a/cnpy.h b/cnpy.h index 0d3bb4c..b9f4a08 100644 --- a/cnpy.h +++ b/cnpy.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -130,7 +131,8 @@ namespace cnpy { fclose(fp); } - template void npz_save(std::string zipname, std::string fname, const T* data, const std::vector& shape, std::string mode = "w") + // Save arrays to NPZ file, optionally with compression + template void npz_save(std::string zipname, std::string fname, const T* data, const std::vector& shape, std::string mode = "w", bool compress = false) { //first, append a .npy to the fname fname += ".npy"; @@ -165,11 +167,60 @@ namespace cnpy { std::vector npy_header = create_npy_header(shape); size_t nels = std::accumulate(shape.begin(),shape.end(),1,std::multiplies()); - size_t nbytes = nels*sizeof(T) + 
npy_header.size(); - - //get the CRC of the data to be added - uint32_t crc = crc32(0L,(uint8_t*)&npy_header[0],npy_header.size()); - crc = crc32(crc,(uint8_t*)data,nels*sizeof(T)); + size_t nbytes_uncompressed = nels*sizeof(T) + npy_header.size(); + + // Prepare data and compression parameters + std::vector buffer_compressed; + size_t nbytes_on_disk; + uint16_t compression_method; + uint32_t crc; + + if(compress) { + // Create uncompressed buffer (header + data) + std::vector uncompressed(nbytes_uncompressed); + memcpy(&uncompressed[0], &npy_header[0], npy_header.size()); + memcpy(&uncompressed[npy_header.size()], data, nels*sizeof(T)); + + // Get CRC of uncompressed data + crc = crc32(0L, &uncompressed[0], nbytes_uncompressed); + + // Compress using zlib deflate (raw deflate, no zlib/gzip header) + uLongf max_compressed_size = compressBound(nbytes_uncompressed); + buffer_compressed.resize(max_compressed_size); + + z_stream strm; + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.opaque = Z_NULL; + int ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -MAX_WBITS, 8, Z_DEFAULT_STRATEGY); + if(ret != Z_OK) { + fclose(fp); + throw std::runtime_error("npz_save: deflateInit2 failed"); + } + + strm.avail_in = nbytes_uncompressed; + strm.next_in = &uncompressed[0]; + strm.avail_out = max_compressed_size; + strm.next_out = &buffer_compressed[0]; + + ret = deflate(&strm, Z_FINISH); + if(ret != Z_STREAM_END) { + deflateEnd(&strm); + fclose(fp); + throw std::runtime_error("npz_save: deflate failed"); + } + + nbytes_on_disk = strm.total_out; + deflateEnd(&strm); + compression_method = 8; // deflate + } + else { + // No compression - CRC computed in two parts + crc = crc32(0L,(uint8_t*)&npy_header[0],npy_header.size()); + crc = crc32(crc,(uint8_t*)data,nels*sizeof(T)); + nbytes_on_disk = nbytes_uncompressed; + compression_method = 0; // store + } //build the local header std::vector local_header; @@ -177,12 +228,12 @@ namespace cnpy { local_header += (uint16_t) 
0x0403; //second part of sig local_header += (uint16_t) 20; //min version to extract local_header += (uint16_t) 0; //general purpose bit flag - local_header += (uint16_t) 0; //compression method + local_header += (uint16_t) compression_method; //compression method local_header += (uint16_t) 0; //file last mod time local_header += (uint16_t) 0; //file last mod date local_header += (uint32_t) crc; //crc - local_header += (uint32_t) nbytes; //compressed size - local_header += (uint32_t) nbytes; //uncompressed size + local_header += (uint32_t) nbytes_on_disk; //compressed size + local_header += (uint32_t) nbytes_uncompressed; //uncompressed size local_header += (uint16_t) fname.size(); //fname length local_header += (uint16_t) 0; //extra field length local_header += fname; @@ -208,13 +259,17 @@ namespace cnpy { footer += (uint16_t) (nrecs+1); //number of records on this disk footer += (uint16_t) (nrecs+1); //total number of records footer += (uint32_t) global_header.size(); //nbytes of global headers - footer += (uint32_t) (global_header_offset + nbytes + local_header.size()); //offset of start of global headers, since global header now starts after newly written array + footer += (uint32_t) (global_header_offset + nbytes_on_disk + local_header.size()); //offset of start of global headers, since global header now starts after newly written array footer += (uint16_t) 0; //zip file comment length //write everything fwrite(&local_header[0],sizeof(char),local_header.size(),fp); - fwrite(&npy_header[0],sizeof(char),npy_header.size(),fp); - fwrite(data,sizeof(T),nels,fp); + if(compress) { + fwrite(&buffer_compressed[0],sizeof(char),nbytes_on_disk,fp); + } else { + fwrite(&npy_header[0],sizeof(char),npy_header.size(),fp); + fwrite(data,sizeof(T),nels,fp); + } fwrite(&global_header[0],sizeof(char),global_header.size(),fp); fwrite(&footer[0],sizeof(char),footer.size(),fp); fclose(fp); @@ -226,10 +281,21 @@ namespace cnpy { npy_save(fname, &data[0], shape, mode); } - template 
void npz_save(std::string zipname, std::string fname, const std::vector data, std::string mode = "w") { + template void npz_save(std::string zipname, std::string fname, const std::vector data, std::string mode = "w", bool compress = false) { + std::vector shape; + shape.push_back(data.size()); + npz_save(zipname, fname, &data[0], shape, mode, compress); + } + + // Convenience wrapper for compressed save + template void npz_save_compressed(std::string zipname, std::string fname, const T* data, const std::vector& shape, std::string mode = "w") { + npz_save(zipname, fname, data, shape, mode, true); + } + + template void npz_save_compressed(std::string zipname, std::string fname, const std::vector data, std::string mode = "w") { std::vector shape; shape.push_back(data.size()); - npz_save(zipname, fname, &data[0], shape, mode); + npz_save(zipname, fname, &data[0], shape, mode, true); } template std::vector create_npy_header(const std::vector& shape) { diff --git a/example1.cpp b/example1.cpp index 70ac5aa..1e4cf90 100644 --- a/example1.cpp +++ b/example1.cpp @@ -52,4 +52,33 @@ int main() double* mv1 = arr_mv1.data(); assert(arr_mv1.shape.size() == 1 && arr_mv1.shape[0] == 1); assert(mv1[0] == myVar1); + + //now test compressed npz files + std::cout << "Testing compressed npz save/load..." 
<< std::endl; + + //save compressed data + cnpy::npz_save_compressed("out_compressed.npz","myVar1",&myVar1,{1},"w"); + cnpy::npz_save_compressed("out_compressed.npz","arr1",&data[0],{Nz,Ny,Nx},"a"); + + //load and verify the compressed file + cnpy::npz_t my_npz_compressed = cnpy::npz_load("out_compressed.npz"); + + //check myVar1 + cnpy::NpyArray arr_mv1_c = my_npz_compressed["myVar1"]; + double* mv1_c = arr_mv1_c.data(); + assert(arr_mv1_c.shape.size() == 1 && arr_mv1_c.shape[0] == 1); + assert(mv1_c[0] == myVar1); + + //check arr1 + cnpy::NpyArray arr1_c = my_npz_compressed["arr1"]; + std::complex* arr1_data_c = arr1_c.data>(); + assert(arr1_c.shape.size() == 3 && arr1_c.shape[0] == Nz && arr1_c.shape[1] == Ny && arr1_c.shape[2] == Nx); + for(int i = 0; i < Nx*Ny*Nz; i++) assert(data[i] == arr1_data_c[i]); + + //also test loading a single variable from compressed file + cnpy::NpyArray arr1_single = cnpy::npz_load("out_compressed.npz", "arr1"); + std::complex* arr1_single_data = arr1_single.data>(); + for(int i = 0; i < Nx*Ny*Nz; i++) assert(data[i] == arr1_single_data[i]); + + std::cout << "Compressed npz test passed!" << std::endl; }