From 5be101a6b837dd2701a6f5f09daebba60c4947d3 Mon Sep 17 00:00:00 2001 From: lauryntalbot <164897268+lauryntalbot@users.noreply.github.com> Date: Thu, 5 Mar 2026 14:33:18 -0800 Subject: [PATCH 1/6] Refactor make_gridfiles to include maskfunction --- pyglider/ncprocess.py | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/pyglider/ncprocess.py b/pyglider/ncprocess.py index 332e7f4..7824e29 100644 --- a/pyglider/ncprocess.py +++ b/pyglider/ncprocess.py @@ -187,8 +187,7 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml, force=False): def make_gridfiles( - inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, starttime='1970-01-01' -): + inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, starttime='1970-01-01', maskfunction=CPROOF_mask): """ Turn a timeseries netCDF file into a vertically gridded netCDF. @@ -223,12 +222,14 @@ def make_gridfiles( profile_meta = deployment['profile_variables'] - ds = xr.open_dataset(inname, decode_times=True) - ds = ds.where(ds.time > np.datetime64(starttime), drop=True) + ds0 = xr.open_dataset(inname, decode_times=True) + ds0 = ds0.where(ds0.time > np.datetime64(starttime), drop=True) _log.info(f'Working on: {inname}') - _log.debug(str(ds)) - _log.debug(str(ds.time[0])) - _log.debug(str(ds.time[-1])) + _log.debug(str(ds0)) + _log.debug(str(ds0.time[0])) + _log.debug(str(ds0.time[-1])) + + ds = maskfunction(ds0) profiles = np.unique(ds.profile_index) profiles = [p for p in profiles if (~np.isnan(p) and not (p % 1) and (p > 0))] @@ -271,32 +272,43 @@ def make_gridfiles( _log.info(f'Done times! {len(dat)}') dsout['profile_time_start'] = ((xdimname), dat, profile_meta['profile_time_start']) dsout['profile_time_end'] = ((xdimname), dat, profile_meta['profile_time_end']) - + for k in ds.keys(): if k in ['time', 'profile', 'longitude', 'latitude', 'depth'] or 'time' in k: continue _log.info('Gridding %s', k) good = np.where(~np.isnan(ds[k]) & (ds['profile_index'] % 1 == 0))[0] + if len(good) <= 0: continue if 'average_method' in ds[k].attrs: - average_method = ds[k].attrs['average_method'] + method = ds[k].attrs['average_method'] ds[k].attrs['processing'] = ( f'Using average method {average_method} for ' f'variable {k} following deployment yaml.' ) - if average_method == 'geometric mean': - average_method = stats.gmean + if method == 'geometric mean': + method = stats.gmean ds[k].attrs['processing'] += ( ' Using geometric mean implementation ' 'scipy.stats.gmean' ) + elif 'QC_protocol' in ds[k].attrs: + method = ds[k].attrs['max_method'] + ds[k].attrs['processing'] = ( + f'Taking the maximum quality flag for ' + f'variable {k} following deployment yaml.' + ) + method = np.nanmax + else: - average_method = 'mean' + method = 'mean' + + dat, xedges, yedges, binnumber = stats.binned_statistic_2d( ds['profile_index'].values[good], ds['depth'].values[good], values=ds[k].values[good], - statistic=average_method, + statistic=method, bins=[profile_bins, depth_bins], ) @@ -361,7 +373,7 @@ def make_gridfiles( _log.info('Done gridding') return outname - + # aliases extract_L0timeseries_profiles = extract_timeseries_profiles From 663222ad1bf28a5617f38935be2abe3cfb1159cc Mon Sep 17 00:00:00 2001 From: lauryntalbot <164897268+lauryntalbot@users.noreply.github.com> Date: Thu, 5 Mar 2026 14:39:37 -0800 Subject: [PATCH 2/6] Modify gappy_fill_vertical to include max_gap parameter Updated gappy_fill_vertical to fill small NaN runs based on max_gap parameter. --- pyglider/utils.py | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/pyglider/utils.py b/pyglider/utils.py index 532898e..04cb422 100644 --- a/pyglider/utils.py +++ b/pyglider/utils.py @@ -657,25 +657,40 @@ def _passthrough(val): return val -def gappy_fill_vertical(data): +def gappy_fill_vertical(data, max_gap=50): """ - Fill vertical gaps from the first to last bin with data in them. - Applied column-wise. - - data = gappy_fill_vertical(data) + Fill ONLY small NaN runs (<= max_gap bins) inside each vertical column. + No extrapolation at top/bottom edges. """ m, n = np.shape(data) for j in range(n): - ind = np.where(~np.isnan(data[:, j]))[0] - if ( - len(ind) > 0 - and len(ind) < (ind[-1] - ind[0]) - and len(ind) > (ind[-1] - ind[0]) * 0.05 - ): - int = np.arange(ind[0], ind[-1]) - data[:, j][ind[0] : ind[-1]] = np.interp(int, ind, data[ind, j]) - return data + col = data[:, j] + ind = np.where(~np.isnan(col))[0] + if len(ind) < 2: + continue + + lo, hi = ind[0], ind[-1] + sub = col[lo:hi+1] + isn = np.isnan(sub) + if not isn.any(): + continue + + x = isn.astype(np.int8) + edges = np.diff(np.r_[0, x, 0]) + starts = np.where(edges == 1)[0] + ends = np.where(edges == -1)[0] # exclusive + + for s, e in zip(starts, ends): + run_len = e - s + if run_len <= max_gap and s > 0 and e < len(sub): + # bracketed by finite values + if np.isfinite(sub[s-1]) and np.isfinite(sub[e]): + sub[s:e] = np.interp(np.arange(s, e), [s-1, e], [sub[s-1], sub[e]]) + + col[lo:hi+1] = sub + data[:, j] = col + return data def find_gaps(sample_time, timebase, maxgap): """ From d9b6dc29bad7085fbf2bdca182e56dc870138a7f Mon Sep 17 00:00:00 2001 From: lauryntalbot <164897268+lauryntalbot@users.noreply.github.com> Date: Thu, 5 Mar 2026 15:28:43 -0800 Subject: [PATCH 3/6] Implement CPROOF_mask function and update gridding Add CPROOF_mask function to mask QC4 samples in data variables, and update make_gridfiles to apply this mask function before gridding. --- pyglider/ncprocess.py | 64 +++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/pyglider/ncprocess.py b/pyglider/ncprocess.py index 7824e29..b90fa92 100644 --- a/pyglider/ncprocess.py +++ b/pyglider/ncprocess.py @@ -186,6 +186,30 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml, force=False): nc.renameDimension('string%d' % trajlen, 'traj_strlen') +def CPROOF_mask(ds): + """Mask QC4 samples in data variables (set to NaN) so gridding ignores them. + Does NOT shorten arrays or overwrite QC variables. + """ + _log = logging.getLogger(__name__) + + ds = ds.copy() + + for k in list(ds.data_vars): + # skip QC variables themselves + if k.endswith("_QC"): + continue + + qc_name = f"{k}_QC" + if qc_name not in ds: + continue + + # mask data where QC == 4, preserving dims/coords + ds[k] = ds[k].where(ds[qc_name] != 4) + ds[qc_name] = ds[qc_name].where(ds[qc_name] != 4) + + return ds + + def make_gridfiles( inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, starttime='1970-01-01', maskfunction=CPROOF_mask): """ @@ -206,6 +230,11 @@ def make_gridfiles( dz : float, default = 1 Vertical grid spacing in meters. + maskfunction : callable or None, optional + Function applied to the dataset before gridding. By default, CPROOF_mask + masks QC4 samples in data variables by setting them to NaN so they are + ignored during gridding. Set to None to skip masking. + Returns ------- outname : str @@ -222,14 +251,17 @@ def make_gridfiles( profile_meta = deployment['profile_variables'] - ds0 = xr.open_dataset(inname, decode_times=True) - ds0 = ds0.where(ds0.time > np.datetime64(starttime), drop=True) + ds = xr.open_dataset(inname, decode_times=True) + + if maskfunction is not None: + ds = maskfunction(ds) + + ds = ds.where(ds.time > np.datetime64(starttime), drop=True) _log.info(f'Working on: {inname}') - _log.debug(str(ds0)) - _log.debug(str(ds0.time[0])) - _log.debug(str(ds0.time[-1])) + _log.debug(str(ds)) + _log.debug(str(ds.time[0])) + _log.debug(str(ds.time[-1])) - ds = maskfunction(ds0) profiles = np.unique(ds.profile_index) profiles = [p for p in profiles if (~np.isnan(p) and not (p % 1) and (p > 0))] @@ -284,7 +316,7 @@ def make_gridfiles( if 'average_method' in ds[k].attrs: method = ds[k].attrs['average_method'] ds[k].attrs['processing'] = ( - f'Using average method {average_method} for ' + f'Using average method {method} for ' f'variable {k} following deployment yaml.' ) if method == 'geometric mean': @@ -293,17 +325,19 @@ def make_gridfiles( ' Using geometric mean implementation ' 'scipy.stats.gmean' ) elif 'QC_protocol' in ds[k].attrs: - method = ds[k].attrs['max_method'] + # QC variables are treated as discrete flags rather than continuous data. + # If a variable has a QC_protocol attribute, it is gridded using the + # maximum flag in each bin (e.g. any QC3 in a bin makes the gridded bin QC3). + method = np.nanmax ds[k].attrs['processing'] = ( f'Taking the maximum quality flag for ' f'variable {k} following deployment yaml.' ) - method = np.nanmax + else: method = 'mean' - dat, xedges, yedges, binnumber = stats.binned_statistic_2d( ds['profile_index'].values[good], ds['depth'].values[good], @@ -315,9 +349,10 @@ def make_gridfiles( _log.debug(f'dat{np.shape(dat)}') dsout[k] = (('depth', xdimname), dat.T, ds[k].attrs) - # fill gaps in data: - dsout[k].values = utils.gappy_fill_vertical(dsout[k].values) - + # Fill only continuous variables, not QC flags + if 'QC_protocol' not in ds[k].attrs : + dsout[k] = dsout[k].interpolate_na(dim="depth", method="linear", max_gap=50) + # fix u and v, because they should really not be gridded... if ('water_velocity_eastward' in dsout.keys()) and ('u' in profile_meta.keys()): _log.debug(str(ds.water_velocity_eastward)) @@ -373,7 +408,8 @@ def make_gridfiles( _log.info('Done gridding') return outname - + + # aliases extract_L0timeseries_profiles = extract_timeseries_profiles From 341dfac95b15a05857fb1e628a92d2f67d39883b Mon Sep 17 00:00:00 2001 From: lauryntalbot <164897268+lauryntalbot@users.noreply.github.com> Date: Thu, 5 Mar 2026 15:31:53 -0800 Subject: [PATCH 4/6] Update utils.py Added original function back --- pyglider/utils.py | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/pyglider/utils.py b/pyglider/utils.py index 04cb422..1fae580 100644 --- a/pyglider/utils.py +++ b/pyglider/utils.py @@ -657,39 +657,23 @@ def _passthrough(val): return val -def gappy_fill_vertical(data, max_gap=50): +def gappy_fill_vertical(data): """ - Fill ONLY small NaN runs (<= max_gap bins) inside each vertical column. - No extrapolation at top/bottom edges. + Fill vertical gaps from the first to last bin with data in them. + Applied column-wise. + + data = gappy_fill_vertical(data) """ m, n = np.shape(data) for j in range(n): - col = data[:, j] - ind = np.where(~np.isnan(col))[0] - if len(ind) < 2: - continue - - lo, hi = ind[0], ind[-1] - sub = col[lo:hi+1] - isn = np.isnan(sub) - if not isn.any(): - continue - - x = isn.astype(np.int8) - edges = np.diff(np.r_[0, x, 0]) - starts = np.where(edges == 1)[0] - ends = np.where(edges == -1)[0] # exclusive - - for s, e in zip(starts, ends): - run_len = e - s - if run_len <= max_gap and s > 0 and e < len(sub): - # bracketed by finite values - if np.isfinite(sub[s-1]) and np.isfinite(sub[e]): - sub[s:e] = np.interp(np.arange(s, e), [s-1, e], [sub[s-1], sub[e]]) - - col[lo:hi+1] = sub - data[:, j] = col - + ind = np.where(~np.isnan(data[:, j]))[0] + if ( + len(ind) > 0 + and len(ind) < (ind[-1] - ind[0]) + and len(ind) > (ind[-1] - ind[0]) * 0.05 + ): + int = np.arange(ind[0], ind[-1]) + data[:, j][ind[0] : ind[-1]] = np.interp(int, ind, data[ind, j]) return data def find_gaps(sample_time, timebase, maxgap): From d8c8a3cf1910f1eef4984c8fbda37dee531700bd Mon Sep 17 00:00:00 2001 From: lauryntalbot <164897268+lauryntalbot@users.noreply.github.com> Date: Thu, 5 Mar 2026 16:04:41 -0800 Subject: [PATCH 5/6] Update ncprocess.py --- pyglider/ncprocess.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pyglider/ncprocess.py b/pyglider/ncprocess.py index b90fa92..c148aed 100644 --- a/pyglider/ncprocess.py +++ b/pyglider/ncprocess.py @@ -211,7 +211,7 @@ def CPROOF_mask(ds): def make_gridfiles( - inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, starttime='1970-01-01', maskfunction=CPROOF_mask): + inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, starttime='1970-01-01', maskfunction=None): """ Turn a timeseries netCDF file into a vertically gridded netCDF. @@ -231,9 +231,7 @@ def make_gridfiles( Vertical grid spacing in meters. maskfunction : callable or None, optional - Function applied to the dataset before gridding. By default, CPROOF_mask - masks QC4 samples in data variables by setting them to NaN so they are - ignored during gridding. Set to None to skip masking. + Function applied to the dataset before gridding. Returns ------- @@ -314,6 +312,9 @@ def make_gridfiles( if len(good) <= 0: continue if 'average_method' in ds[k].attrs: + # variables are treated as d continuous data. + # If a variable has a average_method attribute, it is gridded using the + # mean in each bin method = ds[k].attrs['average_method'] ds[k].attrs['processing'] = ( f'Using average method {method} for ' @@ -410,6 +411,10 @@ def make_gridfiles( return outname + + + + # aliases extract_L0timeseries_profiles = extract_timeseries_profiles From 4042466fb76b450e7a75d351747d6dd6965a1a9a Mon Sep 17 00:00:00 2001 From: lauryntalbot <164897268+lauryntalbot@users.noreply.github.com> Date: Fri, 6 Mar 2026 09:45:25 -0800 Subject: [PATCH 6/6] Implement vertical interpolation for variables Added vertical interpolation for QC and continuous variables in ncprocess.py. --- pyglider/ncprocess.py | 77 ++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 45 deletions(-) diff --git a/pyglider/ncprocess.py b/pyglider/ncprocess.py index c148aed..0d7ddfb 100644 --- a/pyglider/ncprocess.py +++ b/pyglider/ncprocess.py @@ -208,10 +208,19 @@ def CPROOF_mask(ds): ds[qc_name] = ds[qc_name].where(ds[qc_name] != 4) return ds - +def interpolate_vertical(var, attr): + # QC variables: fill interpolatable NaN gaps with 1 + if 'QC_protocol' in attr.attrs.values(): + interp = var.interpolate_na(dim="depth", method="nearest", max_gap=50) + filled = np.isnan(var) & np.isfinite(interp) + return xr.where(filled, 1, var) + + # Continuous variables: linear interpolation + return var.interpolate_na(dim="depth", method="linear", max_gap=50) + def make_gridfiles( - inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, starttime='1970-01-01', maskfunction=None): + inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, starttime='1970-01-01', maskfunction=None, interp_variables=None): """ Turn a timeseries netCDF file into a vertically gridded netCDF. @@ -231,7 +240,7 @@ def make_gridfiles( Vertical grid spacing in meters. maskfunction : callable or None, optional - Function applied to the dataset before gridding. + Function applied to the dataset before gridding, usually to choose what data will be set to NaN based on quality flags. Returns ------- @@ -246,9 +255,7 @@ def make_gridfiles( pass deployment = utils._get_deployment(deploymentyaml) - profile_meta = deployment['profile_variables'] - ds = xr.open_dataset(inname, decode_times=True) if maskfunction is not None: @@ -260,7 +267,6 @@ def make_gridfiles( _log.debug(str(ds.time[0])) _log.debug(str(ds.time[-1])) - profiles = np.unique(ds.profile_index) profiles = [p for p in profiles if (~np.isnan(p) and not (p % 1) and (p > 0))] profile_bins = np.hstack((np.array(profiles) - 0.5, [profiles[-1] + 0.5])) @@ -310,49 +316,36 @@ def make_gridfiles( good = np.where(~np.isnan(ds[k]) & (ds['profile_index'] % 1 == 0))[0] if len(good) <= 0: - continue - if 'average_method' in ds[k].attrs: - # variables are treated as d continuous data. - # If a variable has a average_method attribute, it is gridded using the - # mean in each bin - method = ds[k].attrs['average_method'] - ds[k].attrs['processing'] = ( - f'Using average method {method} for ' - f'variable {k} following deployment yaml.' - ) - if method == 'geometric mean': - method = stats.gmean - ds[k].attrs['processing'] += ( - ' Using geometric mean implementation ' 'scipy.stats.gmean' - ) - elif 'QC_protocol' in ds[k].attrs: + continue + if 'QC_protocol' in ds[k].attrs.values(): # QC variables are treated as discrete flags rather than continuous data. # If a variable has a QC_protocol attribute, it is gridded using the # maximum flag in each bin (e.g. any QC3 in a bin makes the gridded bin QC3). method = np.nanmax - ds[k].attrs['processing'] = ( - f'Taking the maximum quality flag for ' - f'variable {k} following deployment yaml.' - ) - - else: - method = 'mean' - + # variables are treated as continuous data. + # If a variable has a average_method attribute, it is gridded using the + # mean in each bin + if 'average_method' in ds[k].attrs.values(): + method = ds[k].attrs['average_method'] + if method == 'geometric mean': + method = stats.gmean + else: + method = 'mean' + dat, xedges, yedges, binnumber = stats.binned_statistic_2d( - ds['profile_index'].values[good], - ds['depth'].values[good], - values=ds[k].values[good], - statistic=method, - bins=[profile_bins, depth_bins], - ) + ds['profile_index'].values[good], + ds['depth'].values[good], + values=ds[k].values[good], + statistic=method, + bins=[profile_bins, depth_bins], + ) _log.debug(f'dat{np.shape(dat)}') dsout[k] = (('depth', xdimname), dat.T, ds[k].attrs) - # Fill only continuous variables, not QC flags - if 'QC_protocol' not in ds[k].attrs : - dsout[k] = dsout[k].interpolate_na(dim="depth", method="linear", max_gap=50) + if interp_variables is not None: + dsout[k] = interp_variables(dsout[k],ds[k]) # fix u and v, because they should really not be gridded... if ('water_velocity_eastward' in dsout.keys()) and ('u' in profile_meta.keys()): @@ -409,12 +402,6 @@ def make_gridfiles( _log.info('Done gridding') return outname - - - - - - # aliases extract_L0timeseries_profiles = extract_timeseries_profiles