From 39f89f86084efbdaabba05ad4ea598c9c72a318a Mon Sep 17 00:00:00 2001
From: "Sebastian J. Mielke" <sjm@sjmielke.com>
Date: Mon, 14 Oct 2019 17:00:35 -0400
Subject: [PATCH] Multiply word_size by 4 in parse_npy_header for dtype 'U'

For the Unicode dtype 'U', the length NumPy writes in the header counts characters, not bytes. NumPy stores each character as a 4-byte UCS-4/UTF-32 code unit, so the byte size of one element is the character count multiplied by 4.
---
 cnpy.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cnpy.cpp b/cnpy.cpp
index 2d28578..dcd683a 100644
--- a/cnpy.cpp
+++ b/cnpy.cpp
@@ -99,6 +99,10 @@ void cnpy::parse_npy_header(unsigned char* buffer,size_t& word_size, std::vector
     std::string str_ws = header.substr(loc1+2);
     loc2 = str_ws.find("'");
     word_size = atoi(str_ws.substr(0,loc2).c_str());
+
+    //Special case: Unicode chars -- these have 4 bytes!
+    if(header[loc1+1] == 'U')
+        word_size *= 4;
 }
 
 void cnpy::parse_npy_header(FILE* fp, size_t& word_size, std::vector<size_t>& shape, bool& fortran_order) {