diff --git a/.travis.yml b/.travis.yml index 3eb28ff..514c906 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,14 +1,35 @@ language: python + matrix: include: - - python: 3.6 - - python: 3.7 + - os: linux dist: xenial + python: 3.6 + - os: linux + dist: xenial + python: 3.7 + +before_install: + - | + sudo apt-get install -y lsb-release + sudo apt-key adv \ + --keyserver keyserver.ubuntu.com \ + --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 + sudo add-apt-repository \ + --yes \ + "deb https://cloud.r-project.org/bin/linux/ubuntu/ $(lsb_release -c -s)-cran36/" + cat /etc/apt/sources.list + sudo apt-get update -qq + sudo apt-get install -y r-base + cache: pip + install: + - pip install -Iv rpy2==3.3.3 - pip install git+https://github.com/JGCRI/gcam_reader - pip install git+https://github.com/JGCRI/tethys - pip install git+https://github.com/JGCRI/xanthos - pip install . + script: - python -m unittest diff --git a/cassandra/compfactory.py b/cassandra/compfactory.py index 3600703..2a84c98 100644 --- a/cassandra/compfactory.py +++ b/cassandra/compfactory.py @@ -11,9 +11,10 @@ 'GcamComponent': comp.GcamComponent, 'FldgenComponent': comp.FldgenComponent, 'HectorStubComponent': comp.HectorStubComponent, + 'TgavStubComponent': comp.TgavStubComponent, 'TethysComponent': comp.TethysComponent, 'XanthosComponent': comp.XanthosComponent, - 'DummyComponent': comp.DummyComponent, + 'DummyComponent': comp.DummyComponent } diff --git a/cassandra/components.py b/cassandra/components.py index bdbf240..37d03d4 100644 --- a/cassandra/components.py +++ b/cassandra/components.py @@ -1052,8 +1052,7 @@ def _read_scen_data(self, scen): from pickle import load data = pkg_resources.resource_filename('cassandra', 'data') - infile = open(join(data, f'hector-outputstream-{scen}.dat'), - 'rb') + infile = open(join(data, f'hector-outputstream-{scen}.dat'),'rb') df = load(infile) infile.close() @@ -1161,3 +1160,390 @@ def run_component(self): def report_test_results(self): """Report the 
component's results to the unit testing code.""" return self.results[self.name] + + +class TgavStubComponent(ComponentBase): + """Feed in an RDS file from ESM runs used to train the fldgen emulator. + + This component is used instead of the HectorStubComponent to provide `tgav` to fldgen. + + The component provides two capabilities: + :param Tgav: A data frame containing global mean temperature with fields + ['year', 'scenario', 'variable', 'value', 'units']; where value is 'Tgav' + :type Tgav: Pandas DataFrame + + :param tgav_metadata: A dictionary of metadata for the `Tgav` capability including: + [rds_file, scenario, climate_var_name, source_climate_data, units, + count, mean, median, min, max, std, na_count, null_count, all_finite] + designations + :type tgav_metadata: dict + + + The parameters accepted by this component are: + :param rds_file: Full path with file name and extension to the input RDS emulator file + containing `tgav` outputs for each scenario + :type rds_file: str + + :param climate_var_name: Parent climate model variable name (e.g., tasAdjust) found in the file + name for the supporting climate data used by the emulator + :type climate_var_name: str + + :param scenario: Scenario name (e.g., rcp26) + :type scenario: str + + :param units: Unit name from the `tgav` data + :type units: str + + :param start_year: Start year of the climate data + :type start_year: int + + :param through_year: Year climate data goes through + :type through_year: int + + """ + + # RDS file variable names + RDS_TGAV_NAME = 'tgav' + RDS_INFILES_NAME = 'infiles' + + # capability name + TGAV_CAPABILITY_NAME = 'Tgav' + + # output field order for the tgav data frame + TGAV_FIELD_ORDER = ['year', 'scenario', 'variable', 'value', 'units'] + + # component expected configuration fields + RDS_FILE_FIELD = 'rds_file' + CLIMATE_VAR_NAME_FIELD = 'climate_var_name' + SCENARIO_FIELD = 'scenario' + UNITS_FIELD = 'units' + START_YEAR_FIELD = 'start_year' + 
THROUGH_YEAR_FIELD = 'through_year' + + def __init__(self, cap_tbl): + super(TgavStubComponent, self).__init__(cap_tbl) + + # add capabilities to Cassandra + self.addcapability(self.TGAV_CAPABILITY_NAME) + self.addcapability('tgav_metadata') + + def run_component(self): + """Run the TgavStubComponent component""" + + # ensure required params are present + self.validate_params() + + # convert the ListVector object from reading a RDS file to a Python dictionary + rds_dict = self.rds_to_dict() + + # get a dictionary of the target file name and the file index for the specified configuration + target_file_dict = self.build_file_dict(rds_dict) + + # generate a list of years in a realization + year_list = self.build_year_list() + + # generate a list of values for a given variable name and target scenario + tgav_list = self.build_data_list(rds_dict, target_file_dict, year_list) + + # build a pandas data frame to hold tgav output + tgav_df = self.build_dataframe(year_list, tgav_list) + + # report data summary + meta_dict = self.tgav_metadata(tgav_df, target_file_dict) + + # add to cassandra result queue + self.addresults(self.TGAV_CAPABILITY_NAME, tgav_df[self.TGAV_FIELD_ORDER]) + self.addresults('tgav_metadata', meta_dict) + + return 0 + + @staticmethod + def log_raise_exception(exception, message, log_msg=True, raise_msg=True): + """Log an error and raise an exception. + + :param exception: Exception class + :type exception: class object + + :param message: Message to log and raise + :type message: str + + :param log_msg: Optional. Log as error if True; default True + :type log_msg: bool + + :param raise_msg: Optional. 
Raise exception if True; default True + :type raise_msg: bool + + """ + + if log_msg: + logging.error(message) + + if raise_msg: + raise exception(message) + + def validate_params(self): + """Ensure params are present for this component.""" + + # list of expected params + param_list = [self.RDS_FILE_FIELD, self.CLIMATE_VAR_NAME_FIELD, self.SCENARIO_FIELD, self.UNITS_FIELD, + self.START_YEAR_FIELD, self.THROUGH_YEAR_FIELD] + + for i in param_list: + + if i not in self.params: + msg = f"{self.__class__} Required parameter '{i}' not in config file." + self.log_raise_exception(KeyError, msg) + + def validate_year(self, yr, min_yr=0, max_yr=10000): + """Ensure years are within a reasonable range and are integers. + + :param yr: Target year + :type yr: int + + :param min_yr: Minimum year allowable + :type min_yr: int + + :param max_yr: Maximum year allowable + :type max_yr: int + + :return: the validated year as a four digit integer, + unchanged from the input when within the allowable bounds + + """ + + # validate that the year can be converted to an integer + valid_yr = self.validate_int(yr) + + if (valid_yr < min_yr) or (valid_yr > max_yr): + msg = f"{self.__class__} Year '{valid_yr}' is outside of the reasonable bounds [{min_yr} - {max_yr}]." + self.log_raise_exception(ValueError, msg) + + else: + return valid_yr + + def validate_int(self, value): + """Ensure the value can be converted to an integer. + + :param value: Target value + :type value: int, float, str + + :return: Validated integer representation of value + + """ + + if type(value) is int: + return value + + try: + return int(value) + + except ValueError: + msg = f"{self.__class__} Value '{value}' not able to be converted to an integer as expected." + self.log_raise_exception(ValueError, msg) + + def validate_file_exist(self, file_path): + """Ensure file exists. + + :param file_path: Full path with file name and extension to an input file. 
+ :type file_path: str + + :return: Validated file path + + """ + + if os.path.isfile(file_path): + return file_path + + else: + msg = f"Input file '{file_path}' does not exist." + self.log_raise_exception(FileNotFoundError, msg) + + def rds_to_dict(self): + """Read in and convert an RDS file to a Python dictionary. + + :return: A Python dictionary of {variable name: value arrays} + + """ + + import rpy2.robjects as robjects + from rpy2.robjects import pandas2ri + pandas2ri.activate() + + # get a wrapper around the readRDS R function + read_rds = robjects.r['readRDS'] + + # create ListVector object + rds_file = self.validate_file_exist(self.params[self.RDS_FILE_FIELD]) + lvect = read_rds(rds_file) + + # convert the ListVector object to a Python dictionary + return dict(zip(lvect.names, map(list, list(lvect)))) + + def build_file_dict(self, rds_dict): + """Get a list of target files that match the climate variable specified in the configuration. + + :param rds_dict: A Python dictionary of {variable name: value arrays} derived from the emulator + RDS file. + :type rds_dict: dict + + :return: A dictionary of file names and their corresponding index from the + `infiles` variable from the RDS file that match the user defined + configuration parameters. + Format: {'files': [''], 'file_index': []} + + """ + + try: + infile_list = rds_dict[self.RDS_INFILES_NAME] + except KeyError: + msg = f"Field '{self.RDS_INFILES_NAME}' is not in the RDS dictionary." 
+ self.log_raise_exception(KeyError, msg) + + target_file_dict = {} + + # add file name to dictionary + files = target_file_dict.setdefault('files', []) + + # add file index to dictionary + file_index = target_file_dict.setdefault('file_index', []) + + for index, i in enumerate(infile_list): + + # get file name from path + base = os.path.basename(i) + + # get only files matching search criteria from params + if (self.params[self.SCENARIO_FIELD] in base) \ + and (self.params[self.CLIMATE_VAR_NAME_FIELD] in base) \ + and (str(self.params[self.START_YEAR_FIELD]) in base) \ + and (str(self.params[self.THROUGH_YEAR_FIELD]) in base): + + # add file name to dictionary + files.append(i) + + # add file index to dictionary + file_index.append(index) + + # the number of target files found matching the search criteria + n_files = len(target_file_dict['files']) + + if n_files == 0: + msg = f"{self.__class__} There are no data sets matching the input parameters in file list: {infile_list}. One matching file required." + self.log_raise_exception(ValueError, msg) + + elif n_files > 1: + msg = f"{self.__class__} There are {n_files} data sets matching the input parameters in file list: {infile_list}. One matching file required." + self.log_raise_exception(ValueError, msg) + + else: + return target_file_dict + + def build_year_list(self): + """Construct a list of years that the data provides.""" + + # validate years + start_yr = self.validate_year(self.params[self.START_YEAR_FIELD]) + through_yr = self.validate_year(self.params[self.THROUGH_YEAR_FIELD]) + + return list(range(start_yr, through_yr + 1, 1)) + + def build_data_list(self, rds_dict, target_file_dict, year_list): + """Generate a list of values for a given variable name and target scenario. + + :param rds_dict: A Python dictionary of {variable name: value arrays} derived from the emulator + RDS file. 
+ :type rds_dict: dict + + :param target_file_dict: A dictionary of file names and their corresponding index from the + `infiles` variable from the RDS file that match the user defined + configuration parameters. + Format: {'files': [''], 'file_index': []} + :type target_file_dict: dict + + :param year_list: A list of integer years that encompass the data range per realization. + :type year_list: list + + :return: A list of values for a target parameterization + + """ + + # number of years for a realization + n_years = len(year_list) + + # index of file corresponding to the position of the window containing the data for the target params + file_index = target_file_dict['file_index'][0] + + # start and end index for the data window to extract + start_index = n_years * file_index + end_index = n_years * (file_index + 1) + + return rds_dict[self.RDS_TGAV_NAME][start_index:end_index] + + def build_dataframe(self, year_list, tgav_list): + """Build output data frame for Tgav. + + :param year_list: A list of integer years that encompass the data range per realization. + :type year_list: list + + :param tgav_list: A list of values for a given variable name and target scenario + :type tgav_list: list + + :param target_file_dict: A dictionary of file names and their corresponding index from the + `infiles` variable from the RDS file that match the user defined + configuration parameters. 
+ Format: {'files': [''], 'file_index': []} + :type target_file_dict: dict + + :return: A data frame holding Tgav outputs and required ancillary data + + """ + import pandas as pd + + # build a pandas data frame to hold tgav output + df = pd.DataFrame({'year': year_list, 'value': tgav_list}) + + # additional expected fields + df['scenario'] = self.params[self.SCENARIO_FIELD] + df['variable'] = self.TGAV_CAPABILITY_NAME + df['units'] = self.params[self.UNITS_FIELD] + + return df[self.TGAV_FIELD_ORDER] + + def tgav_metadata(self, df, target_file_dict): + """Create a dictionary holding a data summary about the 'Tgav' dataset and parameter assumptions. + + :param df: A data frame holding Tgav outputs and required ancillary data + :type df: data frame + + :param target_file_dict: A dictionary of file names and their corresponding index from the + `infiles` variable from the RDS file that match the user defined + configuration parameters. + Format: {'files': [''], 'file_index': []} + :type target_file_dict: dict + + :return: A dictionary of metadata for the Tgav data + + """ + + from numpy import isfinite + + meta_dict = {'rds_file': self.params[self.RDS_FILE_FIELD], + 'scenario': self.params[self.SCENARIO_FIELD], + 'climate_var_name': self.params[self.CLIMATE_VAR_NAME_FIELD], + 'source_climate_data': target_file_dict['files'][0], + 'units': self.params[self.UNITS_FIELD], + 'count': df['value'].count(), + 'mean': df['value'].mean(), + 'median': df['value'].median(), + 'min': df['value'].min(), + 'max': df['value'].max(), + 'std': df['value'].std(), + 'na_count': df['value'].isna().sum(), + 'null_count': df['value'].isnull().sum(), + 'all_finite': isfinite(df['value']).all()} + + for k in meta_dict.keys(): + logging.info(f"{self.__class__} 'Tgav' data summary: {k}=={meta_dict[k]}") + print(f"{self.__class__} '{self.TGAV_CAPABILITY_NAME}' data summary: {k}=={meta_dict[k]}") + + return meta_dict diff --git a/cassandra/test/data/fldgen-IPSL-CM5A-LR_test.rds 
b/cassandra/test/data/fldgen-IPSL-CM5A-LR_test.rds new file mode 100644 index 0000000..9bcead5 Binary files /dev/null and b/cassandra/test/data/fldgen-IPSL-CM5A-LR_test.rds differ diff --git a/cassandra/test/test_tgav_stub.py b/cassandra/test/test_tgav_stub.py new file mode 100644 index 0000000..07d9644 --- /dev/null +++ b/cassandra/test/test_tgav_stub.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python +"""Test for the Tgav stub component.""" + +import os +import unittest +import pkg_resources + +from cassandra.components import TgavStubComponent + +# if necessary, set the path to your R_HOME environment variable +# os.environ['R_HOME'] = '/Library/Frameworks/R.framework/Resources' + + +class TestTgavStubComponent(unittest.TestCase): + + def setUp(self): + """Set up a HectorStub component for testing.""" + + capability_table = {} + + # configuration + self.rds_file = pkg_resources.resource_filename('cassandra', 'test/data/fldgen-IPSL-CM5A-LR_test.rds') + self.climate_var_name = 'tasAdjust' + self.scenario = 'rcp26' + self.units = 'Kelvin' + self.start_year = 1861 + self.through_year = 2099 + + # expected output + self.meta_dict = {'rds_file': self.rds_file, + 'scenario': self.scenario, + 'climate_var_name': self.climate_var_name, + 'source_climate_data': './training-data/tasAdjust_annual_IPSL-CM5A-LR_rcp26_18610101-20991231.nc', + 'units': self.units, + 'count': 239, + 'mean' : 286.8046116940164, + 'median': 286.30762280534697, + 'min': 284.78008340211915, + 'max': 288.6382439866686, + 'std': 1.182328423261446, + 'na_count': 0, + 'null_count': 0, + 'all_finite': True} + + # instantiate class + self.stub = TgavStubComponent(capability_table) + + # build parameterization + self.stub.addparam('rds_file', self.rds_file) + self.stub.addparam('climate_var_name', self.climate_var_name) + self.stub.addparam('scenario', self.scenario) + self.stub.addparam('units', self.units) + self.stub.addparam('start_year', self.start_year) + self.stub.addparam('through_year', 
self.through_year) + self.stub.finalize_parsing() + + # read in RDS file to dictionary + self.rds_dict = self.stub.rds_to_dict() + + # generate target file dictionary + self.target_file_dict = self.stub.build_file_dict(self.rds_dict) + + # generate year list + self.year_list = self.stub.build_year_list() + + # generate tgav list + self.tgav_list = self.stub.build_data_list(self.rds_dict, self.target_file_dict, self.year_list) + + # generate tgav dataframe + self.tgav_df = self.stub.build_dataframe(self.year_list, self.tgav_list) + + def test_log_raise_exception(self): + """Ensure correct exception is raised.""" + + with self.assertRaises(ValueError): + self.stub.log_raise_exception(ValueError, 'value_error', log_msg=False) + + def test_validate_int(self): + """Expect correct exception and type return.""" + + with self.assertRaises(ValueError): + self.stub.validate_int('fail') + + val = self.stub.validate_int('1984') + self.assertTrue(type(val), int) + self.assertEqual(val, 1984) + + def test_validate_year(self): + """Expect correct exception.""" + + # check min bounds error + with self.assertRaises(ValueError): + self.stub.validate_year(-1) + + # check max bounds error + with self.assertRaises(ValueError): + self.stub.validate_year(999999) + + def test_validate_file_exist(self): + """Expect correct exception.""" + + with self.assertRaises(FileNotFoundError): + self.stub.validate_file_exist('/not/a/file.txt') + + fcheck = self.stub.validate_file_exist(self.rds_file) + self.assertEqual(fcheck, self.rds_file) + + def test_rds_to_dict(self): + """Check output dict for data.""" + + # check for keys + self.assertTrue('tgav' in self.rds_dict) + self.assertTrue('infiles' in self.rds_dict) + + # check for data + self.assertEqual(len(self.rds_dict['tgav']), 956) + self.assertEqual(len(self.rds_dict['infiles']), 8) + + def test_build_file_dict(self): + """Ensure correct exception and content.""" + + # check for missing infiles key + with self.assertRaises(KeyError): + 
self.stub.build_file_dict({}) + + # check for no matching data + with self.assertRaises(ValueError): + self.stub.build_file_dict({'infiles': []}) + + # check for too many matching files + with self.assertRaises(ValueError): + self.stub.build_file_dict({'infiles': ['a', 'a']}) + + # valid outputs + self.assertEqual(self.target_file_dict['files'][0], self.meta_dict['source_climate_data']) + self.assertEqual(self.target_file_dict['file_index'][0], 0) + + def test_build_year_list(self): + """Ensure year list returns correct number of years.""" + + # check type + self.assertTrue(type(self.year_list), list) + + # check first and last year + self.assertEqual(self.year_list[0], self.start_year) + self.assertEqual(self.year_list[-1], self.through_year) + + # check the number of years + self.assertEqual(self.through_year - self.start_year + 1, len(self.year_list)) + + def test_build_data_list(self): + """Check expected data outcome.""" + + self.assertTrue(type(self.tgav_list), list) + + # check number of values for a single scenario and variable + self.assertEqual(len(self.tgav_list), 239) + + def test_build_dataframe(self): + """Confirm fields and shape.""" + + # check data frame shape + self.assertEqual(self.tgav_df.shape, (239, 5)) + + # check column names + self.assertEqual(list(self.tgav_df.columns), ['year', 'scenario', 'variable', 'value', 'units']) + + def test_tgav_metadata(self): + """Test expected output.""" + + meta_dict = self.stub.tgav_metadata(self.tgav_df, self.target_file_dict) + + # check like keys + self.assertEqual(meta_dict.keys(), self.meta_dict.keys()) + + # check value equality + for k in meta_dict.keys(): + self.assertEqual(meta_dict[k], self.meta_dict[k]) + + def test_run_component(self): + """Test expected output.""" + + rval = self.stub.run_component() + + # test run success + self.assertEqual(rval, 0) + + +if __name__ == '__main__': + unittest.main() diff --git a/requirements.txt b/requirements.txt index f757a9e..35fae0a 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ configobj>=5.0.6 pandas>=0.20 +numpy>=1.16 +rpy2>=2.9