ACCESS-NRI · joshuatorrance · May 14, 2026 · May 14, 2026 · May 14, 2026 · May 15, 2026
diff --git a/splitnc/README.md b/splitnc/README.md
@@ -28,7 +28,7 @@ Newline characters in the file will be treated as whitespace, i.e. newlines can
 
 For example to replicate this command line,
 ```
-python splitnc.py --verbose --overwrite --output-dir /output/directory --shared-vars latitude_longitude --rename-regex "(?P<newname>.+)_\d+" /input/directory/*.nc
+splitnc --verbose --overwrite --output-dir /output/directory --shared-vars latitude_longitude --rename-regex "(?P<newname>.+)_\d+" /input/directory/*.nc
 ```
 the following file could be used;
 ```
@@ -68,6 +68,16 @@ options:
   --rename-regex REGEX  Look for duplicated coordinate names that match the given regex and rename them to the first
                         "newname" capture group in the regex. E.g. "(?P<newname>.*)_\d+" will match "time_0" and rename
                         it to "time".
+  --use-esm1p6-filenames
+                        Use the ESM1.6 filename pattern for the output files:
+                        access-esm1p6.{component}.{dimensions}.{field}.{freq}.{time_cell_method}.{datestamp}.nc
+                        splitnc will attempt to deduce all the components of the filename. If this option is not given
+                        {field}_{original_filename} will be used.
+  --file-freq FILE_FREQ
+                        Specify the frequency of the files (not the data), e.g. if each file contains a month of data
+                        then the file-frequency is '1mon'. Used to determine the resolution of the timestamp for ESM1.6
+                        filenames. Follows the ACCESS frequency vocabulary (e.g. '1yr', '1mon', '1day', '1hr'), any
+                        unrecognised frequency will use the full timestamp. Defaults to '1yr'.
   --output-dir OUTPUT_DIR
                         Output directory for the processed files. If not given output files will be placed in the same
                         directory as the original file.
@@ -89,7 +99,7 @@ Alternatively create a new python environment and install `xarray` and `netCDF4`
 ### Atmosphere
-To use this script for split multi-field atmosphere files from ACCESS-ESM1.6:
+To use this script to split multi-field atmosphere files from ACCESS-ESM1.6:
 ```bash
-python split-nc.py --shared-vars latitude_longitude  --rename-regex "(?P<newname>.+)_\\d+" $INPUT_DIR/*.nc
+splitnc --shared-vars latitude_longitude  --rename-regex "(?P<newname>.+)_\\d+" $INPUT_DIR/*.nc
 ```
 
 `splitnc` will automatically determine which variables are fields by looking at which variables depend on other variables.
@@ -105,7 +115,7 @@ included in all files even though none of the field variable depend on it.
 ### Ice
-To use this script for split multi-field ice files from ACCESS-ESM1.6:
+To use this script to split multi-field ice files from ACCESS-ESM1.6:
 ```bash
-python split-nc.py --shared-vars uarea,tmask,tarea --excluded-vars VGRD. $INPUT_DIR/*.nc
+splitnc --shared-vars uarea,tmask,tarea --excluded-vars VGRD. $INPUT_DIR/*.nc
 ```
 
 In comparison to the atmosphere files, ice files have different shared-vars and there are no duplicated variables that require renaming.

diff --git a/splitnc/pyproject.toml b/splitnc/pyproject.toml
@@ -22,6 +22,9 @@ dependencies = [
         "xarray>2025.1.2",
     ]
 
+[project.scripts]
+splitnc = "splitnc:main"
+
 [build-system]
 build-backend = "setuptools.build_meta"
 requires = [
@@ -30,4 +33,4 @@ requires = [
 
 [tool.pytest.ini_options]
     testpaths = "test"
-    pythonpath = "."
+    pythonpath = "src"
diff --git a/splitnc/src/splitnc/__init__.py b/splitnc/src/splitnc/__init__.py
@@ -0,0 +1 @@
+from .splitnc import *
diff --git a/splitnc/src/splitnc/esm1p6.py b/splitnc/src/splitnc/esm1p6.py
@@ -0,0 +1,154 @@
+import logging
+import re
+
+
+def _build_model():
+    """Return the model element of the filename, which never varies."""
+    model_name = "access-esm1p6"
+    return model_name
+
+
+def _build_component(ds):
+    """
+    Deduce the component element of the filename from the global attributes.
+
+    Returns "cice5" when the "source" attribute names CICE version 5, or
+    "um7p3" when it names the Met Office Unified Model and the "um_version"
+    attribute is "7.3".
+
+    Raises ValueError when the source is missing or not recognised. A
+    ValueError (rather than a KeyError from a missing attribute) is used so
+    that every failure to identify the component is reported the same way.
+    """
+    source = ds.attrs.get("source")
+    if source is None:
+        raise ValueError("Unknown source, no 'source' attribute found")
+
+    if "Los Alamos Sea Ice Model (CICE) Version 5" in source:
+        return "cice5"
+
+    # A UM source without a um_version attribute is treated as unrecognised
+    is_um = "Data from Met Office Unified Model" in source
+    if is_um and ds.attrs.get("um_version") == "7.3":
+        return "um7p3"
+
+    raise ValueError(f"Unknown source, {source}")
+
+
+def _build_dimensions(ds, field_name):
+    """
+    Classify a field as "2d" or "3d" from its number of non-time dimensions.
+
+    Raises ValueError for any other number of non-time dimensions.
+    """
+    # Time does not count towards the dimensionality of the field
+    ndims = sum(1 for dim in ds[field_name].dims if dim != 'time')
+
+    labels = {2: "2d", 3: "3d"}
+    if ndims not in labels:
+        raise ValueError(f"Unexpected number for dimensions, {ndims}")
+
+    return labels[ndims]
+
+
+def _build_frequency(ds, field_name, input_filepath):
+    """
+    Deduce the data frequency element of the filename.
+
+    Fields without a time dimension are "fx". Otherwise the frequency is
+    parsed from the name of the input file, which must either start with the
+    ice pattern iceh-<N><yearly|monthly|daily|hourly>-, contain "_mon.nc",
+    "_dai.nc" or "aiihca.pc", or match <anything>_<N>hr.nc.
+
+    Raises ValueError if the filename matches none of these patterns.
+    """
+    # Frequency: use fx if no time dim
+    if 'time' not in ds[field_name].dims:
+        return "fx"
+
+    # Attempt to parse from expected filenames
+    filename = input_filepath.name
+
+    # Define the expected ice filenames
+    # e.g. iceh-2hourly-mean_0272.nc, iceh-1yearly-mean_0272.nc
+    ice_regex = r"iceh-(?P<num>\d+)(?P<unit>yearly|monthly|daily|hourly)-"
+    ice_unit_mapping = {
+        "yearly": "yr",
+        "monthly": "mon",
+        "daily": "day",
+        "hourly": "hr",
+    }
+
+    if match := re.match(ice_regex, filename):
+        # Extract the frequency number and units for ice files
+        return f"{match['num']}{ice_unit_mapping[match['unit']]}"
+    elif "_mon.nc" in filename:
+        # Match the monthly pattern for atmosphere files
+        return "1mon"
+    elif "_dai.nc" in filename:
+        # Match the daily pattern for atmosphere files
+        return "1day"
+    elif match := re.match(r".+_(\d+hr)\.nc", filename):
+        # Get the frequency from the atmosphere regex match for Xhr
+        # The dot is escaped so that only a literal ".nc" is accepted
+        return match[1]
+    elif "aiihca.pc" in filename:
+        # Match another pattern for hourly atmosphere files
+        return "1hr"
+
+    # No sub-hourly frequency data expected
+    raise ValueError("Unable to deduce frequency from filename")
+
+
+
+def _build_cell_method(ds, field_name):
+    """
+    Build the optional time cell method element of the filename.
+
+    Returns ".snap" when the field has time_rep = instantaneous or a
+    cell_methods entry of "time: point", "." followed by the method named in
+    "time: <method>" for any other time entry, and "" when neither attribute
+    yields a time cell method. The leading "." is added here because the
+    element is optional in the filename.
+    """
+    field_attrs = ds[field_name].attrs
+
+    # ice files sometimes have time_rep = instantaneous but not
+    # cell_methods = time: point
+    if field_attrs.get('time_rep') == "instantaneous":
+        return ".snap"
+
+    # A missing cell_methods attribute is handled the same way as one that
+    # has no time entry: the element is omitted from the filename
+    found = re.search(r"time: (\w+)", field_attrs.get("cell_methods", ""))
+    if found is None:
+        return ""
+
+    method = "snap" if found[1] == "point" else found[1]
+    return "." + method
+
+
+def _build_datestamp(ds, field_name, file_freq):
+    """
+    Build the optional datestamp element of the filename.
+
+    Returns "" for fields without a time dimension. Otherwise returns "."
+    followed by the mid-point of the time bounds (or of the time values when
+    no bounds are available), formatted at the resolution selected by
+    file_freq. Any file_freq that is not a yr/dec, mon or day frequency uses
+    the full timestamp.
+    """
+    if 'time' not in ds[field_name].dims:
+        # No datetime for fixed files
+        return ""
+
+    # Truncate average time val by output file frequency
+    # datetimes do not correctly zero-pad so need to use %4Y
+    freq_formats = (
+        (r'\d+(yr|dec)', '%4Y'),
+        (r'\d+mon', '%4Y-%m'),
+        (r'\d+day', '%4Y-%m-%d'),
+    )
+    fmt = next(
+        (f for pattern, f in freq_formats if re.match(pattern, file_freq)),
+        '%4Y-%m-%dT%H:%M:%S',
+    )
+
+    # Prefer the time bounds, falling back to time itself
+    try:
+        bounds_name = ds['time'].attrs["bounds"]
+        time_values = ds[bounds_name]
+    except KeyError:
+        logging.debug("Unable to find time bounds, using time to calculate filename timestamp")
+        time_values = ds['time']
+    else:
+        logging.debug("Using time bounds to calculate filename timestamp")
+
+    # Calculate the middle point
+    start, end = time_values.min(), time_values.max()
+    midpoint = start + (end - start) / 2
+
+    return "." + midpoint.dt.strftime(fmt).data.flatten()[0]
+
+
+def build_esm1p6_filename(ds, field_name, input_filepath, esm1p6_filename=False, file_freq="1yr"):
+    """
+    Build an output filename of the form
+    {model}.{component}.{dimensions}.{field}.{freq}{time_cell_method}{datestamp}.nc
+
+    The time_cell_method and datestamp elements supply their own leading "."
+    and may be empty. The esm1p6_filename argument is accepted but not used
+    by this function.
+
+    A ValueError raised while deducing any element is re-raised with the
+    field name and input filepath appended to its args.
+    """
+    try:
+        model = _build_model()
+        component = _build_component(ds)
+        dimensions = _build_dimensions(ds, field_name)
+        freq = _build_frequency(ds, field_name, input_filepath)
+        time_cell_method = _build_cell_method(ds, field_name)
+        datestamp = _build_datestamp(ds, field_name, file_freq)
+    except ValueError as e:
+        # Reraise the exception with some extra information
+        e.args = (*e.args, f"While building output filename for field {field_name} and {input_filepath}")
+        raise
+
+    return (
+        f"{model}.{component}.{dimensions}.{field_name}."
+        f"{freq}{time_cell_method}{datestamp}.nc"
+    )
diff --git a/splitnc/splitnc.py → splitnc/src/splitnc/splitnc.py b/splitnc/splitnc.py → splitnc/src/splitnc/splitnc.py
@@ -10,6 +10,8 @@
 
 import xarray as xr
 
+from splitnc.esm1p6 import build_esm1p6_filename
+
 
 def determine_field_vars(ds):
     """
@@ -232,6 +234,25 @@ def update_history_attr(ds, new_history):
     ds.attrs["history"] = old_history + new_history
 
 
+def build_filename(ds, field_name, input_filepath, esm1p6_filename=False, file_freq="1yr"):
+    """
+    Return the name (not the full path) to use for a field's output file.
+
+    By default this is <field_name>_<original_file_name>.
+
+    If esm1p6_filename is set, the name is built by build_esm1p6_filename
+    instead, following the ESM1.6 naming scheme:
+    {model}.{component}.{dimensions}.{field}.{freq}.{time_cell_method}.{datestamp}.nc
+    More info here: https://access-om3-configs.access-hive.org.au/configurations/Ocean_diagnostics/
+    The elements of that scheme are deduced from the Dataset, the original
+    filename, and the given output file frequency.
+    """
+    if not esm1p6_filename:
+        return f"{field_name}_{input_filepath.name}"
+
+    return build_esm1p6_filename(
+        ds,
+        field_name,
+        input_filepath,
+        esm1p6_filename=esm1p6_filename,
+        file_freq=file_freq,
+    )
+
+
 def process_file(
     filepath,
     field_vars=None,
@@ -241,12 +262,14 @@ def process_file(
     output_dir=None,
     overwrite=False,
     update_history=True,
+    esm1p6_filename=False,
+    file_freq="1yr",
 ):
     logging.debug(f"Processing {filepath}")
     filepath = Path(filepath)
 
-    # Use cftime to suppress warnings
+    # Decode times at microsecond resolution rather than forcing cftime
-    decoder = xr.coders.CFDatetimeCoder(use_cftime=True)
+    decoder = xr.coders.CFDatetimeCoder(time_unit='us')
     with xr.open_dataset(filepath, decode_times=decoder) as ds:
         # Resolve any regex in the excluded_vars list
         if excluded_vars:
@@ -342,22 +365,36 @@ def process_file(
             else:
                 output_dir = filepath.parent
 
-            output_filename = output_dir / f"{v}_{filepath.name}"
-            logging.debug(f"Output filepath is {output_filename}")
-
-            if not overwrite and output_filename.exists():
-                logging.error(f"Output file already exists - {output_filename}")
+            # Build the output filepath
+            filename = build_filename(
+                ds=ds_v,
+                field_name=v,
+                input_filepath=filepath,
+                esm1p6_filename=esm1p6_filename,
+                file_freq=file_freq,
+            )
+            output_filepath = output_dir / filename
+            logging.debug(f"Output filepath is {output_filepath}")
+
+            # Write to file
+            if not overwrite and output_filepath.exists():
+                logging.error(f"Output file already exists - {output_filepath}")
                 logging.error("Use --overwrite to overwrite existing files")
 
-                raise FileExistsError(f"{output_filename} already exists")
+                raise FileExistsError(f"{output_filepath} already exists")
 
             logging.debug("Creating parent directory and writing to output file")
-            output_filename.parent.mkdir(parents=True, exist_ok=True)
-            ds_v.to_netcdf(output_filename)
+            output_filepath.parent.mkdir(parents=True, exist_ok=True)
+            ds_v.to_netcdf(output_filepath)
 
 
 #### Main
 def arg_parse(cmdline_args=None):
+    # If -c/--command-line-file is being used then all other args are ignored
+    # This affects which are "required" (or nargs for filepaths)
+    args = sys.argv if cmdline_args is None else cmdline_args
+    cmd_file_arg_present = "-c" in args or "--command-line-file" in args
+
     parser = argparse.ArgumentParser(
         prog="splitnc",
         description="Splits a multi-field netCDF file into separate one-field files",
@@ -384,7 +421,7 @@ def globbable_string_list(string_list):
-    # required and --cmd-line-file can be used on it's own
+    # required and --command-line-file can be used on its own
     parser.add_argument(
         "filepaths",
-        nargs="*",
+        nargs="*" if cmd_file_arg_present else "+",
         default=[],
         type=globbable_string_list,
         help="One or more filepaths to process",
@@ -425,6 +462,24 @@ def globbable_string_list(string_list):
         'regex. E.g. "(?P<newname>.*)_\\d+" will match "time_0" and '
         'rename it to "time".',
     )
+    parser.add_argument(
+        "--use-esm1p6-filenames",
+        action="store_true",
+        help="Use the ESM1.6 filename pattern for the output files: "
+        "access-esm1p6.{component}.{dimensions}.{field}.{freq}.{time_cell_method}.{datestamp}.nc"
+        " splitnc will attempt to deduce all the components of the filename. "
+        "If this option is not given {field}_{original_filename} will be used."
+    )
+    parser.add_argument(
+        "--file-freq",
+        default="1yr",
+        help="Specify the frequency of the files (not the data), e.g. if each "
+        "file contains a month of data then the file-frequency is '1mon'. Used "
+        "to determine the resolution of the timestamp for ESM1.6 filenames. "
+        "Follows the ACCESS frequency vocabulary (e.g. '1yr', '1mon', '1day', "
+        "'1hr'), any unrecognised frequency will use the full timestamp. "
+        "Defaults to '1yr'."
+    )
     parser.add_argument(
         "--output-dir",
         help="Output directory for the processed files. If not given output "
@@ -496,6 +551,8 @@ def main():
             output_dir=args.output_dir,
             overwrite=args.overwrite,
             update_history=not args.dont_update_history,
+            esm1p6_filename=args.use_esm1p6_filenames,
+            file_freq=args.file_freq,
         )
 
 

diff --git a/splitnc/test/common.py b/splitnc/test/common.py
@@ -1,6 +1,5 @@
 import os
 from pathlib import Path
-import pytest
 import shlex
 import subprocess
 
@@ -18,8 +17,9 @@ def runcmd(cmd, wd=None, env=None):
     )
 
 
-def make_nc(tmp_path, cdl_file, filename="test.nc"):
-    filepath = f"{tmp_path}/{filename}"
+def make_nc(tmp_path, cdl_file):
+    nc_filename = Path(cdl_file).with_suffix(".nc").name
+    filepath = f"{tmp_path}/{nc_filename}"
     cmd = f"ncgen -o {filepath}  {cdl_file}"
 
     runcmd(cmd)

diff --git a/splitnc/test/data/aiihca.pa-234501_mon.cdl b/splitnc/test/data/aiihca.pa-234501_mon.cdl
@@ -1890,6 +1890,16 @@ variables:
 		fld_s33i002:cell_methods = "time: mean" ;
 		fld_s33i002:grid_mapping = "latitude_longitude" ;
 		fld_s33i002:coordinates = "sigma_theta surface_altitude theta_level_height" ;
+	float fld_artificial(time, model_theta_level_number, lat, lon) ;
+		fld_artificial:_FillValue = 1.e+20f ;
+		fld_artificial:long_name = "ATM TRACER  2               AFTER TS" ;
+		fld_artificial:um_stash_source = "m01s33i002" ;
+		fld_artificial:missing_value = 1.e+20f ;
+		fld_artificial:cell_methods = "time: point" ;
+		fld_artificial:grid_mapping = "latitude_longitude" ;
+		fld_artificial:coordinates = "sigma_theta surface_altitude theta_level_height" ;
+		fld_artificial:notes = "this variable was added manually to test 'time: point' filenames" ;
+
 
 // global attributes:
 		:history = "File /scratch/p66/jxs599/access-esm/archive/Nov25-NewNitrogen-Nov25-NewNitrogen-6a5acd30/output200/atmosphere/aiihca.pai5jan converted with /g/data/vk83/apps/base_conda/envs/payu-1.2.0/lib/python3.10/site-packages/um2nc/um2netcdf.py 1.1.0 at 2025-12-04 20:03:16" ;