From 39f89f86084efbdaabba05ad4ea598c9c72a318a Mon Sep 17 00:00:00 2001
From: "Sebastian J. Mielke"
Date: Mon, 14 Oct 2019 17:00:35 -0400
Subject: [PATCH] Multiply word_size by 4 in parse_npy_header for dtype 'U'

For Unicode data, the NumPy "length" refers to characters, not bytes,
specifically UCS-4/UTF-32 encoded characters, so we need 4 bytes of
storage for each element.
---
 cnpy.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cnpy.cpp b/cnpy.cpp
index 2d28578..dcd683a 100644
--- a/cnpy.cpp
+++ b/cnpy.cpp
@@ -99,6 +99,10 @@ void cnpy::parse_npy_header(unsigned char* buffer,size_t& word_size, std::vector
     std::string str_ws = header.substr(loc1+2);
     loc2 = str_ws.find("'");
     word_size = atoi(str_ws.substr(0,loc2).c_str());
+
+    //Special case: Unicode chars -- these have 4 bytes!
+    if(header[loc1+1] == 'U')
+        word_size *= 4;
 }
 
 void cnpy::parse_npy_header(FILE* fp, size_t& word_size, std::vector<size_t>& shape, bool& fortran_order) {