Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,19 @@ g++ -o mycode mycode.cpp -L/path/to/install/dir -lcnpy -lz --std=c++11

# Description:

There are two functions for writing data: `npy_save` and `npz_save`.
There are three functions for writing data: `npy_save`, `npz_save`, and `npz_save_compressed`.

- `npy_save` saves a single array to a .npy file.
- `npz_save` saves arrays to a .npz file. Accepts an optional `compress` parameter (default: false) to enable zlib deflate compression.
- `npz_save_compressed` is a convenience wrapper that calls `npz_save` with compression enabled (equivalent to `numpy.savez_compressed`).

There are 3 functions for reading:
- `npy_load` will load a .npy file.
- `npz_load(fname)` will load a .npz and return a dictionary of NpyArray structures.
- `npz_load(fname)` will load a .npz and return a dictionary of NpyArray structures.
- `npz_load(fname,varname)` will load and return the NpyArray for data varname from the specified .npz file.

Both compressed and uncompressed .npz files are supported for reading, including files created by `numpy.savez_compressed()` which use ZIP64 format.

The data structure for loaded data is below.
Data is accessed via the `data<T>()`-method, which returns a pointer of the specified type (which must match the underlying datatype of the data).
The array shape and word size are read from the npy header.
Expand Down
40 changes: 34 additions & 6 deletions cnpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,24 @@ cnpy::NpyArray load_the_npy_file(FILE* fp) {
return arr;
}

// Helper function to parse ZIP64 extended info from a local header's extra field.
//
// extra_field   : raw bytes of the extra field that follows the file name.
// compr_bytes   : in/out; replaced only if it holds the sentinel 0xFFFFFFFF.
// uncompr_bytes : in/out; replaced only if it holds the sentinel 0xFFFFFFFF.
//
// The extra field is a sequence of records (2-byte id, 2-byte payload length,
// payload). Every record is walked until the ZIP64 one (id 0x0001) is found, so
// a ZIP64 record that is preceded by other records is still honoured. A record
// whose declared length runs past the end of the buffer stops the scan and
// leaves both sizes untouched.
//
// All values are decoded byte-by-byte as little-endian, which avoids unaligned
// loads through reinterpret_cast and does not depend on host byte order.
//
// NOTE: the outputs are 32-bit, so a real size above 4 GiB is truncated here.
void parse_zip64_sizes(const std::vector<char>& extra_field, uint32_t& compr_bytes, uint32_t& uncompr_bytes) {
    const uint32_t kSaturated = 0xFFFFFFFFu;
    if(compr_bytes != kSaturated && uncompr_bytes != kSaturated) return; // nothing to resolve

    // Decode `nbytes` little-endian bytes starting at `pos`; the caller guarantees the range is in bounds.
    auto read_le = [&extra_field](size_t pos, size_t nbytes) -> uint64_t {
        uint64_t value = 0;
        for(size_t i = 0; i < nbytes; i++)
            value |= static_cast<uint64_t>(static_cast<unsigned char>(extra_field[pos + i])) << (8 * i);
        return value;
    };

    const size_t total = extra_field.size();
    size_t pos = 0;
    while(total - pos >= 4) { // pos never exceeds total, so this cannot wrap
        const uint16_t record_id = static_cast<uint16_t>(read_le(pos, 2));
        const size_t record_size = static_cast<size_t>(read_le(pos + 2, 2));
        pos += 4;
        if(record_size > total - pos) return; // truncated/corrupt record: do not read past the buffer

        if(record_id == 0x0001) { // ZIP64 extended info
            // Same minimum length the original code required: with 16 payload bytes
            // the uncompressed size is at payload offset 0 and the compressed size
            // at offset 8, regardless of which of the two is saturated.
            if(record_size >= 16) {
                if(uncompr_bytes == kSaturated)
                    uncompr_bytes = static_cast<uint32_t>(read_le(pos, 8));
                if(compr_bytes == kSaturated)
                    compr_bytes = static_cast<uint32_t>(read_le(pos + 8, 8));
            }
            return;
        }
        pos += record_size; // skip an unrelated record
    }
}

cnpy::NpyArray load_the_npz_array(FILE* fp, uint32_t compr_bytes, uint32_t uncompr_bytes) {

std::vector<unsigned char> buffer_compr(compr_bytes);
Expand Down Expand Up @@ -257,9 +275,9 @@ cnpy::npz_t cnpy::npz_load(std::string fname) {

//read in the extra field
uint16_t extra_field_len = *(uint16_t*) &local_header[28];
std::vector<char> extra_field(extra_field_len);
if(extra_field_len > 0) {
std::vector<char> buff(extra_field_len);
size_t efield_res = fread(&buff[0],sizeof(char),extra_field_len,fp);
size_t efield_res = fread(&extra_field[0],sizeof(char),extra_field_len,fp);
if(efield_res != extra_field_len)
throw std::runtime_error("npz_load: failed fread");
}
Expand All @@ -268,6 +286,9 @@ cnpy::npz_t cnpy::npz_load(std::string fname) {
uint32_t compr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+18);
uint32_t uncompr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+22);

// ZIP64 support: if sizes are 0xFFFFFFFF, read from extra field
parse_zip64_sizes(extra_field, compr_bytes, uncompr_bytes);

if(compr_method == 0) {arrays[varname] = load_the_npy_file(fp);}
else {arrays[varname] = load_the_npz_array(fp,compr_bytes,uncompr_bytes);}
}
Expand Down Expand Up @@ -300,21 +321,28 @@ cnpy::NpyArray cnpy::npz_load(std::string fname, std::string varname) {

//read in the extra field
uint16_t extra_field_len = *(uint16_t*) &local_header[28];
fseek(fp,extra_field_len,SEEK_CUR); //skip past the extra field
std::vector<char> extra_field(extra_field_len);
if(extra_field_len > 0) {
size_t efield_res = fread(&extra_field[0],sizeof(char),extra_field_len,fp);
if(efield_res != extra_field_len)
throw std::runtime_error("npz_load: failed fread");
}

uint16_t compr_method = *reinterpret_cast<uint16_t*>(&local_header[0]+8);
uint32_t compr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+18);
uint32_t uncompr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+22);

// ZIP64 support: if sizes are 0xFFFFFFFF, read from extra field
parse_zip64_sizes(extra_field, compr_bytes, uncompr_bytes);

if(vname == varname) {
NpyArray array = (compr_method == 0) ? load_the_npy_file(fp) : load_the_npz_array(fp,compr_bytes,uncompr_bytes);
fclose(fp);
return array;
}
else {
//skip past the data
uint32_t size = *(uint32_t*) &local_header[22];
fseek(fp,size,SEEK_CUR);
//skip past the data (use compr_bytes for compressed data)
fseek(fp,compr_bytes,SEEK_CUR);
}
}

Expand Down
94 changes: 80 additions & 14 deletions cnpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include<typeinfo>
#include<iostream>
#include<cassert>
#include<cstring>
#include<zlib.h>
#include<map>
#include<memory>
Expand Down Expand Up @@ -130,7 +131,8 @@ namespace cnpy {
fclose(fp);
}

template<typename T> void npz_save(std::string zipname, std::string fname, const T* data, const std::vector<size_t>& shape, std::string mode = "w")
// Save arrays to NPZ file, optionally with compression
template<typename T> void npz_save(std::string zipname, std::string fname, const T* data, const std::vector<size_t>& shape, std::string mode = "w", bool compress = false)
{
//first, append a .npy to the fname
fname += ".npy";
Expand Down Expand Up @@ -165,24 +167,73 @@ namespace cnpy {
std::vector<char> npy_header = create_npy_header<T>(shape);

size_t nels = std::accumulate(shape.begin(),shape.end(),1,std::multiplies<size_t>());
size_t nbytes = nels*sizeof(T) + npy_header.size();

//get the CRC of the data to be added
uint32_t crc = crc32(0L,(uint8_t*)&npy_header[0],npy_header.size());
crc = crc32(crc,(uint8_t*)data,nels*sizeof(T));
size_t nbytes_uncompressed = nels*sizeof(T) + npy_header.size();

// Prepare data and compression parameters
std::vector<uint8_t> buffer_compressed;
size_t nbytes_on_disk;
uint16_t compression_method;
uint32_t crc;

if(compress) {
// Create uncompressed buffer (header + data)
std::vector<uint8_t> uncompressed(nbytes_uncompressed);
memcpy(&uncompressed[0], &npy_header[0], npy_header.size());
memcpy(&uncompressed[npy_header.size()], data, nels*sizeof(T));

// Get CRC of uncompressed data
crc = crc32(0L, &uncompressed[0], nbytes_uncompressed);

// Compress using zlib deflate (raw deflate, no zlib/gzip header)
uLongf max_compressed_size = compressBound(nbytes_uncompressed);
buffer_compressed.resize(max_compressed_size);

z_stream strm;
strm.zalloc = Z_NULL;
strm.zfree = Z_NULL;
strm.opaque = Z_NULL;
int ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -MAX_WBITS, 8, Z_DEFAULT_STRATEGY);
if(ret != Z_OK) {
fclose(fp);
throw std::runtime_error("npz_save: deflateInit2 failed");
}

strm.avail_in = nbytes_uncompressed;
strm.next_in = &uncompressed[0];
strm.avail_out = max_compressed_size;
strm.next_out = &buffer_compressed[0];

ret = deflate(&strm, Z_FINISH);
if(ret != Z_STREAM_END) {
deflateEnd(&strm);
fclose(fp);
throw std::runtime_error("npz_save: deflate failed");
}

nbytes_on_disk = strm.total_out;
deflateEnd(&strm);
compression_method = 8; // deflate
}
else {
// No compression - CRC computed in two parts
crc = crc32(0L,(uint8_t*)&npy_header[0],npy_header.size());
crc = crc32(crc,(uint8_t*)data,nels*sizeof(T));
nbytes_on_disk = nbytes_uncompressed;
compression_method = 0; // store
}

//build the local header
std::vector<char> local_header;
local_header += "PK"; //first part of sig
local_header += (uint16_t) 0x0403; //second part of sig
local_header += (uint16_t) 20; //min version to extract
local_header += (uint16_t) 0; //general purpose bit flag
local_header += (uint16_t) 0; //compression method
local_header += (uint16_t) compression_method; //compression method
local_header += (uint16_t) 0; //file last mod time
local_header += (uint16_t) 0; //file last mod date
local_header += (uint32_t) crc; //crc
local_header += (uint32_t) nbytes; //compressed size
local_header += (uint32_t) nbytes; //uncompressed size
local_header += (uint32_t) nbytes_on_disk; //compressed size
local_header += (uint32_t) nbytes_uncompressed; //uncompressed size
local_header += (uint16_t) fname.size(); //fname length
local_header += (uint16_t) 0; //extra field length
local_header += fname;
Expand All @@ -208,13 +259,17 @@ namespace cnpy {
footer += (uint16_t) (nrecs+1); //number of records on this disk
footer += (uint16_t) (nrecs+1); //total number of records
footer += (uint32_t) global_header.size(); //nbytes of global headers
footer += (uint32_t) (global_header_offset + nbytes + local_header.size()); //offset of start of global headers, since global header now starts after newly written array
footer += (uint32_t) (global_header_offset + nbytes_on_disk + local_header.size()); //offset of start of global headers, since global header now starts after newly written array
footer += (uint16_t) 0; //zip file comment length

//write everything
fwrite(&local_header[0],sizeof(char),local_header.size(),fp);
fwrite(&npy_header[0],sizeof(char),npy_header.size(),fp);
fwrite(data,sizeof(T),nels,fp);
if(compress) {
fwrite(&buffer_compressed[0],sizeof(char),nbytes_on_disk,fp);
} else {
fwrite(&npy_header[0],sizeof(char),npy_header.size(),fp);
fwrite(data,sizeof(T),nels,fp);
}
fwrite(&global_header[0],sizeof(char),global_header.size(),fp);
fwrite(&footer[0],sizeof(char),footer.size(),fp);
fclose(fp);
Expand All @@ -226,10 +281,21 @@ namespace cnpy {
npy_save(fname, &data[0], shape, mode);
}

template<typename T> void npz_save(std::string zipname, std::string fname, const std::vector<T> data, std::string mode = "w") {
// Save a std::vector as a one-dimensional array named `fname` inside the
// archive `zipname`. `mode` and `compress` are forwarded unchanged to the
// pointer-based npz_save overload; the shape is the single extent data.size().
template<typename T> void npz_save(std::string zipname, std::string fname, const std::vector<T> data, std::string mode = "w", bool compress = false) {
    const std::vector<size_t> shape(1, data.size());
    // data.data() instead of &data[0]: indexing element 0 of an empty vector is undefined behaviour.
    npz_save(zipname, fname, data.data(), shape, mode, compress);
}

// Compressed counterpart of npz_save for raw-pointer data: forwards every
// argument to npz_save and sets its `compress` flag to true.
template<typename T> void npz_save_compressed(std::string zipname, std::string fname, const T* data, const std::vector<size_t>& shape, std::string mode = "w") {
    const bool compress = true;
    npz_save(zipname, fname, data, shape, mode, compress);
}

// Compressed save of a std::vector as a one-dimensional array named `fname`
// inside the archive `zipname`. The array is written exactly once, through
// npz_save with its `compress` flag set to true.
template<typename T> void npz_save_compressed(std::string zipname, std::string fname, const std::vector<T> data, std::string mode = "w") {
    const std::vector<size_t> shape(1, data.size());
    // data.data() instead of &data[0]: indexing element 0 of an empty vector is undefined behaviour.
    npz_save(zipname, fname, data.data(), shape, mode, true);
}

template<typename T> std::vector<char> create_npy_header(const std::vector<size_t>& shape) {
Expand Down
29 changes: 29 additions & 0 deletions example1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,33 @@ int main()
double* mv1 = arr_mv1.data<double>();
assert(arr_mv1.shape.size() == 1 && arr_mv1.shape[0] == 1);
assert(mv1[0] == myVar1);

//now test compressed npz files
std::cout << "Testing compressed npz save/load..." << std::endl;

//save compressed data
cnpy::npz_save_compressed("out_compressed.npz","myVar1",&myVar1,{1},"w");
cnpy::npz_save_compressed("out_compressed.npz","arr1",&data[0],{Nz,Ny,Nx},"a");

//load and verify the compressed file
cnpy::npz_t my_npz_compressed = cnpy::npz_load("out_compressed.npz");

//check myVar1
cnpy::NpyArray arr_mv1_c = my_npz_compressed["myVar1"];
double* mv1_c = arr_mv1_c.data<double>();
assert(arr_mv1_c.shape.size() == 1 && arr_mv1_c.shape[0] == 1);
assert(mv1_c[0] == myVar1);

//check arr1
cnpy::NpyArray arr1_c = my_npz_compressed["arr1"];
std::complex<double>* arr1_data_c = arr1_c.data<std::complex<double>>();
assert(arr1_c.shape.size() == 3 && arr1_c.shape[0] == Nz && arr1_c.shape[1] == Ny && arr1_c.shape[2] == Nx);
for(int i = 0; i < Nx*Ny*Nz; i++) assert(data[i] == arr1_data_c[i]);

//also test loading a single variable from compressed file
cnpy::NpyArray arr1_single = cnpy::npz_load("out_compressed.npz", "arr1");
std::complex<double>* arr1_single_data = arr1_single.data<std::complex<double>>();
for(int i = 0; i < Nx*Ny*Nz; i++) assert(data[i] == arr1_single_data[i]);

std::cout << "Compressed npz test passed!" << std::endl;
}