From 9ed6ff6200f9d2dd789b16f746dbba402ba8cd70 Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Thu, 11 Jun 2020 17:24:15 -0400
Subject: [PATCH 01/18] add tgav component to factory

---
 cassandra/compfactory.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cassandra/compfactory.py b/cassandra/compfactory.py
index 3600703..2a84c98 100644
--- a/cassandra/compfactory.py
+++ b/cassandra/compfactory.py
@@ -11,9 +11,10 @@
     'GcamComponent': comp.GcamComponent,
     'FldgenComponent': comp.FldgenComponent,
     'HectorStubComponent': comp.HectorStubComponent,
+    'TgavStubComponent': comp.TgavStubComponent,
     'TethysComponent': comp.TethysComponent,
     'XanthosComponent': comp.XanthosComponent,
-    'DummyComponent': comp.DummyComponent,
+    'DummyComponent': comp.DummyComponent
 }
 
 
From 4a63ab67ff7020af5b5931fd7b66bda73b2317f2 Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Thu, 11 Jun 2020 17:26:05 -0400
Subject: [PATCH 02/18] setup tgav component

---
 cassandra/components.py | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

diff --git a/cassandra/components.py b/cassandra/components.py
index bdbf240..42f4cae 100644
--- a/cassandra/components.py
+++ b/cassandra/components.py
@@ -1052,8 +1052,7 @@ def _read_scen_data(self, scen):
         from pickle import load
 
         data = pkg_resources.resource_filename('cassandra', 'data')
-        infile = open(join(data, f'hector-outputstream-{scen}.dat'),
-                      'rb')
+        infile = open(join(data, f'hector-outputstream-{scen}.dat'),'rb')
         df = load(infile)
         infile.close()
 
@@ -1161,3 +1160,40 @@ def run_component(self):
     def report_test_results(self):
         """Report the component's results to the unit testing code."""
         return self.results[self.name]
+
+
+class TgavStubComponent(ComponentBase):
+    """Feed in external time series of temperature data to provide to fldgen to
+    produce new realizations.
+
+
+    """
+
+    # RDS file variable names
+    RDS_TGAV_NAME = 'tgav'
+    RDS_INFILES_NAME = 'infiles'
+
+    # capability name
+    TGAV_CAPABILITY_NAME = 'Tgav'
+
+    # output field order for the tgav data frame
+    TGAV_FIELD_ORDER = ['year', 'scenario', 'variable', 'value', 'units']
+
+    # component expected configuration fields
+    RDS_FILE_FIELD = 'rds_file'
+    CLIMATE_VAR_NAME_FIELD = 'climate_var_name'
+    SCENARIO_FIELD = 'scenario'
+    UNITS_FIELD = 'units'
+
+    def __init__(self, cap_tbl):
+        super(TgavStubComponent, self).__init__(cap_tbl)
+        self.addcapability(self.TGAV_CAPABILITY_NAME)
+
+    def run_component(self):
+        """Run the TgavStubComponent component
+
+        Load the requested scenarios and make each variable available to the
+        rest of the system.
+
+        """
+        import pandas as pd

From 0d793a20fd9fb8dceee9cacbcdf4cc222a9fd08f Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Thu, 11 Jun 2020 17:27:10 -0400
Subject: [PATCH 03/18] validate parameters method

---
 cassandra/components.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/cassandra/components.py b/cassandra/components.py
index 42f4cae..38ddec3 100644
--- a/cassandra/components.py
+++ b/cassandra/components.py
@@ -1197,3 +1197,17 @@ def run_component(self):
 
         """
         import pandas as pd
+
+    def validate_params(self):
+        """Ensure params are present for this component."""
+
+        # list of expected params
+        param_list = [self.RDS_FILE_FIELD, self.CLIMATE_VAR_NAME_FIELD, self.SCENARIO_FIELD, self.UNITS_FIELD]
+
+        for i in param_list:
+
+            if i not in self.params:
+                msg = f"{self.__class__} Required parameter '{i}' not in config file."
+                logging.error(msg)
+                raise KeyError(msg)
+    

From 09d6d5fde96fa79616b6318671b68e0976e33a53 Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Thu, 11 Jun 2020 17:27:55 -0400
Subject: [PATCH 04/18] validate year method

---
 cassandra/components.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/cassandra/components.py b/cassandra/components.py
index 38ddec3..752a182 100644
--- a/cassandra/components.py
+++ b/cassandra/components.py
@@ -1210,4 +1210,31 @@ def validate_params(self):
                 msg = f"{self.__class__} Required parameter '{i}' not in config file."
                 logging.error(msg)
                 raise KeyError(msg)
-    
+
+    def validate_year(self, yr, min_yr=0, max_yr=10000):
+        """Ensure years are within a reasonable range and are integers.
+
+        :param yr:                      Target year
+        :type yr:                       int
+
+        :param min_yr:                  Minimum year allowable
+        :type min_yr:                   int
+
+        :param max_yr:                  Maximum year allowable
+        :type max_yr:                   int
+
+        :return:                        The validated year as an integer within the
+                                        inclusive range [min_yr, max_yr]
+
+        """
+
+        # validate that the year can be converted to an integer
+        valid_yr = self.validate_int(yr)
+
+        if (valid_yr < min_yr) or (valid_yr > max_yr):
+            msg = f"{self.__class__} Year '{valid_yr}' is outside of the reasonable bounds [{min_yr} - {max_yr}]."
+            logging.error(msg)
+            raise ValueError(msg)
+
+        else:
+            return valid_yr

From 1774b6a17b4f89d114992e262a2e44a0849dee1c Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Thu, 11 Jun 2020 17:28:24 -0400
Subject: [PATCH 05/18] validate integer method

---
 cassandra/components.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/cassandra/components.py b/cassandra/components.py
index 752a182..1baa546 100644
--- a/cassandra/components.py
+++ b/cassandra/components.py
@@ -1238,3 +1238,24 @@ def validate_year(self, yr, min_yr=0, max_yr=10000):
 
         else:
             return valid_yr
+
+    def validate_int(self, value):
+        """Ensure the value can be converted to an integer.
+
+        :param value:                   Target value
+        :type value:                    int, float, str
+
+        :return:                        Validated int of value
+
+        """
+
+        if type(value) == int:
+            return value
+
+        try:
+            return int(value)
+
+        except TypeError:
+            msg = f"{self.__class__} Value '{value}' not able to be converted to an integer as expected."
+            logging.error(msg)
+            raise TypeError(msg)

From 0a7c21f0d4a3b3337f1fcb501ed8a87f6ebeb512 Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Thu, 11 Jun 2020 17:29:11 -0400
Subject: [PATCH 06/18] method to read rds into dict

---
 cassandra/components.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/cassandra/components.py b/cassandra/components.py
index 1baa546..aeebcbc 100644
--- a/cassandra/components.py
+++ b/cassandra/components.py
@@ -1259,3 +1259,22 @@ def validate_int(self, value):
             msg = f"{self.__class__} Value '{value}' not able to be converted to an integer as expected."
             logging.error(msg)
             raise TypeError(msg)
+
+    def rds_to_dict(self):
+        """Read in and convert an RDS file to a Python dictionary.
+
+        :return:                        A Python dictionary of {variable name: value arrays}
+
+        """
+        import rpy2.robjects as robjects
+        from rpy2.robjects import pandas2ri
+        pandas2ri.activate()
+
+        # get a wrapper around the readRDS R function
+        read_rds = robjects.r['readRDS']
+
+        # create ListVector object
+        lvect = read_rds(self.params[self.RDS_FILE_FIELD])
+
+        # convert the ListVector object to a Python dictionary
+        return dict(zip(lvect.names, map(list, list(lvect))))

From 2d96a893a46a45b27aab1b2ca71ce6b8823e9d54 Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Thu, 11 Jun 2020 17:30:16 -0400
Subject: [PATCH 07/18] method to get a list of climate files supporting
 emulator

---
 cassandra/components.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/cassandra/components.py b/cassandra/components.py
index aeebcbc..a9a4cb5 100644
--- a/cassandra/components.py
+++ b/cassandra/components.py
@@ -1278,3 +1278,25 @@ def rds_to_dict(self):
 
         # convert the ListVector object to a Python dictionary
         return dict(zip(lvect.names, map(list, list(lvect))))
+
+    def get_files(self, rds_dict):
+        """Get a list of target files that match the climate variable specified in the configuration.
+
+        :param rds_dict:                A Python dictionary of {variable name: value arrays} derived from the emulator
+                                        RDS file.
+        :type rds_dict:                 dict
+
+        :return:                        A list of file names from the `infiles` variable from the RDS file that match
+                                        the user defined climate variable name.
+
+        """
+
+        target_files = [i for i in rds_dict[self.RDS_INFILES_NAME] if os.path.basename(i).split('_')[0] == self.params[self.CLIMATE_VAR_NAME_FIELD]]
+
+        if len(target_files) == 0:
+            msg = f"{self.__class__} There are no datasets matching `climate_var_name` == {self.params[self.CLIMATE_VAR_NAME_FIELD]}"
+            logging.error(msg)
+            raise ValueError(msg)
+
+        else:
+            return target_files

From bf93d3afbe41c5fd46452c0493f88c8a6468278b Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Thu, 11 Jun 2020 17:30:46 -0400
Subject: [PATCH 08/18] generate year list method

---
 cassandra/components.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/cassandra/components.py b/cassandra/components.py
index a9a4cb5..4491b96 100644
--- a/cassandra/components.py
+++ b/cassandra/components.py
@@ -1300,3 +1300,28 @@ def get_files(self, rds_dict):
 
         else:
             return target_files
+
+    def generate_year_list(self, target_file):
+        """Generate a list of years that encompass the data range per realization from a parsed a file name.
+
+        :param target_file:             A file name from the `infiles` variable from the RDS file that match
+                                        the user defined climate variable name.
+        :type target_file:              str
+
+        :return:                        A list of integer years that encompass the data range per realization.
+
+        """
+
+        # get a year list for each file from the file name
+        yrs = os.path.basename(target_file).split('_')[-1].split('.')[0].split('-')
+        start_yr = self.validate_year(yrs[0][:4])
+        through_yr = self.validate_year(yrs[1][:4])
+
+        # ensure start year is not greater than through year
+        if start_yr > through_yr:
+            msg = f"{self.__class__} Start year '{start_yr}' > through year '{through_yr}' for emulator file '{target_file}'"
+            logging.error(msg)
+            raise ValueError(msg)
+
+        # create a list of years found in each scenario
+        return list(range(start_yr, through_yr + 1, 1))

From 752347ee6b348bb84133e2204713e7bd434ed424 Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Thu, 11 Jun 2020 17:31:35 -0400
Subject: [PATCH 09/18] build data list method

---
 cassandra/components.py | 59 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/cassandra/components.py b/cassandra/components.py
index 4491b96..72105ab 100644
--- a/cassandra/components.py
+++ b/cassandra/components.py
@@ -1325,3 +1325,62 @@ def generate_year_list(self, target_file):
 
         # create a list of years found in each scenario
         return list(range(start_yr, through_yr + 1, 1))
+
+    def build_data_list(self, rds_dict, target_files, year_list, target_scenario):
+        """Generate a list of values for a given variable name and target scenario.
+
+        :param rds_dict:                A Python dictionary of {variable name: value arrays} derived from the emulator
+                                        RDS file.
+        :type rds_dict:                 dict
+
+        :param target_files:            A list of file names from the `infiles` variable from the RDS file that match
+                                        the user defined climate variable name.
+        :type target_files:             list
+
+        :param year_list:               A list of integer years that encompass the data range per realization.
+        :type year_list:                list
+
+        :param target_scenario:         The scenario name (e.g., rcp26)
+        :type target_scenario:          str
+
+        :return:                        A list of values for a given variable name and target scenario.
+
+        """
+
+        # get the number of years
+        len_yr_list = len(year_list)
+
+        d_scenario_tgav = {}
+        for idx, i in enumerate(target_files):
+
+            # split path by delim
+            f_split = os.path.basename(i).split('_')
+
+            # scenario name from file name
+            scn_name = f_split[3]
+
+            # get first files ending index value for slicing out data per year
+            if idx == 0:
+                start_idx = 0
+                end_idx = len_yr_list
+            else:
+                start_idx = end_idx
+                end_idx += len_yr_list
+
+            # add tgav data to scenario dict
+            if scn_name not in d_scenario_tgav:
+                d_scenario_tgav[scn_name] = rds_dict[self.RDS_TGAV_NAME][start_idx:end_idx]
+
+            else:
+                msg = f"{self.__class__} Multiple scenarios in target files for {scn_name}."
+                logging.error(msg)
+                raise KeyError(msg)
+
+        # ensure the target scenario is in the dictionary
+        if target_scenario not in d_scenario_tgav:
+            msg = f"{self.__class__} Scenario '{target_scenario}' is not in the RDS supporting climate file options: '{d_scenario_tgav.keys()}'"
+            logging.error(msg)
+            raise KeyError(msg)
+
+        else:
+            return d_scenario_tgav[target_scenario]

From 74f449dc81aee4291c3c2fbbd6ebc29ce995abc0 Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Thu, 11 Jun 2020 17:43:56 -0400
Subject: [PATCH 10/18] update run component method and docs

---
 cassandra/components.py | 54 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/cassandra/components.py b/cassandra/components.py
index 72105ab..60e08b8 100644
--- a/cassandra/components.py
+++ b/cassandra/components.py
@@ -1163,9 +1163,30 @@ def report_test_results(self):
 
 
 class TgavStubComponent(ComponentBase):
-    """Feed in external time series of temperature data to provide to fldgen to
-    produce new realizations.
+    """Feed in an RDS file from ESM runs used to train the fldgen emulator.
 
+    This component is used instead of the HectorStubComponent to provide `tgav` to fldgen.
+
+    The component provides one capability:
+      * Tgav   : global mean temperature
+
+    The `Tgav` capability returns a data frame with data from the target scenario.
+
+    The parameters accepted by this component are:
+
+    :param rds_file:                    Full path with file name and extension to the input RDS emulator file
+                                        containing `tgav` outputs for each scenario
+    :type rds_file:                     str
+
+    :param climate_var_name:            Parent climate model variable name (e.g., tasAdjust) found in the file
+                                        name for the supporting climate data used by the emulator
+    :type climate_var_name:             str
+
+    :param scenario:                    Scenario name (e.g., rcp26)
+    :type scenario:                     str
+
+    :param units_field:                 Unit name from the `tgav` data
+    :type units_field:                  str
 
     """
 
@@ -1198,6 +1219,35 @@ def run_component(self):
         """
         import pandas as pd
 
+        # ensure required params are present
+        self.validate_params()
+
+        # convert the ListVector object from reading a RDS file to a Python dictionary
+        rds_dict = self.rds_to_dict()
+
+        # get a list of target files that match the climate variable specified in the configuration
+        target_files = self.get_files(rds_dict)
+
+        # generate a list of years in a realization
+        year_list = self.generate_year_list(target_files[0])
+
+        # generate a dictionary of {scenario:  data_array} for a tgav for a valid scenario
+        tgav_list = self.build_data_list(rds_dict, target_files, year_list, self.params[self.SCENARIO_FIELD])
+
+        # build a pandas data frame to hold tgav output
+        tgav_df = pd.DataFrame({'year': year_list,
+                                'value': tgav_list})
+
+        # additional expected fields
+        tgav_df['scenario'] = self.params[self.SCENARIO_FIELD]
+        tgav_df['variable'] = self.TGAV_CAPABILITY_NAME
+        tgav_df['units'] = self.params[self.UNITS_FIELD]
+
+        # add to cassandra result queue
+        self.addresults(self.TGAV_CAPABILITY_NAME, tgav_df[self.TGAV_FIELD_ORDER])
+
+        return 0
+
     def validate_params(self):
         """Ensure params are present for this component."""
 

From a2cdf5c342355a43f9b21a24ef5e32014ba09dd3 Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Fri, 12 Jun 2020 11:27:21 -0400
Subject: [PATCH 11/18] streamline tgav component and add metadata validation

---
 cassandra/components.py | 226 ++++++++++++++++++++++++----------------
 1 file changed, 139 insertions(+), 87 deletions(-)

diff --git a/cassandra/components.py b/cassandra/components.py
index 60e08b8..a7c6055 100644
--- a/cassandra/components.py
+++ b/cassandra/components.py
@@ -1167,13 +1167,19 @@ class TgavStubComponent(ComponentBase):
 
     This component is used instead of the HectorStubComponent to provide `tgav` to fldgen.
 
-    The component provides one capability:
-      * Tgav   : global mean temperature
+    The component provides two capabilities:
+    :param Tgav:                        A data frame containing global mean temperature with fields
+                                        ['year', 'scenario', 'variable', 'value', 'units']; where variable is 'Tgav'
+    :type Tgav:                         Pandas DataFrame
 
-    The `Tgav` capability returns a data frame with data from the target scenario.
+    :param tgav_metadata:               A dictionary of metadata for the `Tgav` capability including:
+                                        [rds_file, scenario, climate_var_name, source_climate_data, units,
+                                        count, mean, median, min, max, std, na_count, null_count, all_finite]
+                                        designations
+    :type tgav_metadata:                dict
 
-    The parameters accepted by this component are:
 
+    The parameters accepted by this component are:
     :param rds_file:                    Full path with file name and extension to the input RDS emulator file
                                         containing `tgav` outputs for each scenario
     :type rds_file:                     str
@@ -1188,6 +1194,12 @@ class TgavStubComponent(ComponentBase):
     :param units_field:                 Unit name from the `tgav` data
     :type units_field:                  str
 
+    :param start_year:                  Start year of the climate data
+    :type start_year:                   int
+
+    :param through_year:                Year climate data goes through
+    :type through_year:                 int
+
     """
 
     # RDS file variable names
@@ -1205,18 +1217,18 @@ class TgavStubComponent(ComponentBase):
     CLIMATE_VAR_NAME_FIELD = 'climate_var_name'
     SCENARIO_FIELD = 'scenario'
     UNITS_FIELD = 'units'
+    START_YEAR_FIELD = 'start_year'
+    THROUGH_YEAR_FIELD = 'through_year'
 
     def __init__(self, cap_tbl):
         super(TgavStubComponent, self).__init__(cap_tbl)
+
+        # add capabilities to Cassandra
         self.addcapability(self.TGAV_CAPABILITY_NAME)
+        self.addcapability('tgav_metadata')
 
     def run_component(self):
-        """Run the TgavStubComponent component
-
-        Load the requested scenarios and make each variable available to the
-        rest of the system.
-
-        """
+        """Run the TgavStubComponent component"""
         import pandas as pd
 
         # ensure required params are present
@@ -1225,19 +1237,22 @@ def run_component(self):
         # convert the ListVector object from reading a RDS file to a Python dictionary
         rds_dict = self.rds_to_dict()
 
-        # get a list of target files that match the climate variable specified in the configuration
-        target_files = self.get_files(rds_dict)
+        # get a dictionary of the target file name and the file index for the specified configuration
+        target_file_dict = self.build_file_dict(rds_dict)
 
         # generate a list of years in a realization
-        year_list = self.generate_year_list(target_files[0])
+        year_list = self.build_year_list()
 
-        # generate a dictionary of {scenario:  data_array} for a tgav for a valid scenario
-        tgav_list = self.build_data_list(rds_dict, target_files, year_list, self.params[self.SCENARIO_FIELD])
+        # generate a list of values for a given variable name and target scenario
+        tgav_list = self.build_data_list(rds_dict, target_file_dict, year_list)
 
         # build a pandas data frame to hold tgav output
         tgav_df = pd.DataFrame({'year': year_list,
                                 'value': tgav_list})
 
+        # report data summary
+        meta_dict = self.tgav_metadata(tgav_df, target_file_dict)
+
         # additional expected fields
         tgav_df['scenario'] = self.params[self.SCENARIO_FIELD]
         tgav_df['variable'] = self.TGAV_CAPABILITY_NAME
@@ -1245,21 +1260,46 @@ def run_component(self):
 
         # add to cassandra result queue
         self.addresults(self.TGAV_CAPABILITY_NAME, tgav_df[self.TGAV_FIELD_ORDER])
+        self.addresults('tgav_metadata', meta_dict)
 
         return 0
 
+    @staticmethod
+    def log_raise_exception(exception, message, log_msg=True, raise_msg=True):
+        """Log an error and raise an exception.
+
+        :param exception:               Exception class
+        :type exception:                class object
+
+        :param message:                 Message to log and raise
+        :type message:                  str
+
+        :param log_msg:                 Optional.  Log as error if True; default True
+        :type log_msg:                  bool
+
+        :param raise_msg:               Optional. Raise exception if True; default True
+        :type raise_msg:                bool
+
+        """
+
+        if log_msg:
+            logging.error(message)
+
+        if raise_msg:
+            raise exception(message)
+
     def validate_params(self):
         """Ensure params are present for this component."""
 
         # list of expected params
-        param_list = [self.RDS_FILE_FIELD, self.CLIMATE_VAR_NAME_FIELD, self.SCENARIO_FIELD, self.UNITS_FIELD]
+        param_list = [self.RDS_FILE_FIELD, self.CLIMATE_VAR_NAME_FIELD, self.SCENARIO_FIELD, self.UNITS_FIELD,
+                      self.START_YEAR_FIELD, self.THROUGH_YEAR_FIELD]
 
         for i in param_list:
 
             if i not in self.params:
                 msg = f"{self.__class__} Required parameter '{i}' not in config file."
-                logging.error(msg)
-                raise KeyError(msg)
+                self.log_raise_exception(KeyError, msg)
 
     def validate_year(self, yr, min_yr=0, max_yr=10000):
         """Ensure years are within a reasonable range and are integers.
@@ -1283,8 +1323,7 @@ def validate_year(self, yr, min_yr=0, max_yr=10000):
 
         if (valid_yr < min_yr) or (valid_yr > max_yr):
             msg = f"{self.__class__} Year '{valid_yr}' is outside of the reasonable bounds [{min_yr} - {max_yr}]."
-            logging.error(msg)
-            raise ValueError(msg)
+            self.log_raise_exception(ValueError, msg)
 
         else:
             return valid_yr
@@ -1299,7 +1338,7 @@ def validate_int(self, value):
 
         """
 
-        if type(value) == int:
+        if type(value) is int:
             return value
 
         try:
@@ -1307,8 +1346,7 @@ def validate_int(self, value):
 
         except TypeError:
             msg = f"{self.__class__} Value '{value}' not able to be converted to an integer as expected."
-            logging.error(msg)
-            raise TypeError(msg)
+            self.log_raise_exception(TypeError, msg)
 
     def rds_to_dict(self):
         """Read in and convert an RDS file to a Python dictionary.
@@ -1316,6 +1354,7 @@ def rds_to_dict(self):
         :return:                        A Python dictionary of {variable name: value arrays}
 
         """
+
         import rpy2.robjects as robjects
         from rpy2.robjects import pandas2ri
         pandas2ri.activate()
@@ -1329,108 +1368,121 @@ def rds_to_dict(self):
         # convert the ListVector object to a Python dictionary
         return dict(zip(lvect.names, map(list, list(lvect))))
 
-    def get_files(self, rds_dict):
+    def build_file_dict(self, rds_dict):
         """Get a list of target files that match the climate variable specified in the configuration.
 
         :param rds_dict:                A Python dictionary of {variable name: value arrays} derived from the emulator
                                         RDS file.
         :type rds_dict:                 dict
 
-        :return:                        A list of file names from the `infiles` variable from the RDS file that match
-                                        the user defined climate variable name.
+        :return:                        A dictionary of file names and their corresponding index from the
+                                        `infiles` variable from the RDS file that match the user defined
+                                        configuration parameters.
+                                        Format: {'files': ['<file path>'], 'file_index': [<file index>]}
 
         """
 
-        target_files = [i for i in rds_dict[self.RDS_INFILES_NAME] if os.path.basename(i).split('_')[0] == self.params[self.CLIMATE_VAR_NAME_FIELD]]
+        target_file_dict = {}
+        for index, i in enumerate(rds_dict[self.RDS_INFILES_NAME]):
 
-        if len(target_files) == 0:
-            msg = f"{self.__class__} There are no datasets matching `climate_var_name` == {self.params[self.CLIMATE_VAR_NAME_FIELD]}"
-            logging.error(msg)
-            raise ValueError(msg)
+            # get file name from path
+            base = os.path.basename(i)
 
-        else:
-            return target_files
+            # get only files matching search criteria from params
+            if (self.params[self.SCENARIO_FIELD] in base) \
+                    and (self.params[self.CLIMATE_VAR_NAME_FIELD] in base) \
+                    and (str(self.params[self.START_YEAR_FIELD]) in base) \
+                    and (str(self.params[self.THROUGH_YEAR_FIELD]) in base):
 
-    def generate_year_list(self, target_file):
-        """Generate a list of years that encompass the data range per realization from a parsed a file name.
+                # add file name to dictionary
+                target_file_dict.setdefault('files', []).append(i)
 
-        :param target_file:             A file name from the `infiles` variable from the RDS file that match
-                                        the user defined climate variable name.
-        :type target_file:              str
+                # add file index to dictionary
+                target_file_dict.setdefault('file_index', []).append(index)
 
-        :return:                        A list of integer years that encompass the data range per realization.
+        # the number of target files found matching the search criteria
+        n_files = len(target_file_dict['files'])
 
-        """
+        if n_files == 0:
+            msg = f"{self.__class__} There are no data sets matching the input parameters in file list: {rds_dict[self.RDS_INFILES_NAME]}. One matching file required."
+            self.log_raise_exception(ValueError, msg)
+
+        elif n_files > 1:
+            msg = f"{self.__class__} There are {n_files} data sets matching the input parameters in file list: {rds_dict[self.RDS_INFILES_NAME]}. One matching file required."
+            self.log_raise_exception(ValueError, msg)
+
+        else:
+            return target_file_dict
 
-        # get a year list for each file from the file name
-        yrs = os.path.basename(target_file).split('_')[-1].split('.')[0].split('-')
-        start_yr = self.validate_year(yrs[0][:4])
-        through_yr = self.validate_year(yrs[1][:4])
+    def build_year_list(self):
+        """Construct a list of years that the data provides."""
 
-        # ensure start year is not greater than through year
-        if start_yr > through_yr:
-            msg = f"{self.__class__} Start year '{start_yr}' > through year '{through_yr}' for emulator file '{target_file}'"
-            logging.error(msg)
-            raise ValueError(msg)
+        # validate years
+        start_yr = self.validate_year(self.params[self.START_YEAR_FIELD])
+        through_yr = self.validate_year(self.params[self.THROUGH_YEAR_FIELD])
 
-        # create a list of years found in each scenario
         return list(range(start_yr, through_yr + 1, 1))
 
-    def build_data_list(self, rds_dict, target_files, year_list, target_scenario):
+    def build_data_list(self, rds_dict, target_file_dict, year_list):
         """Generate a list of values for a given variable name and target scenario.
 
         :param rds_dict:                A Python dictionary of {variable name: value arrays} derived from the emulator
                                         RDS file.
         :type rds_dict:                 dict
 
-        :param target_files:            A list of file names from the `infiles` variable from the RDS file that match
-                                        the user defined climate variable name.
-        :type target_files:             list
+        :param target_file_dict:        A dictionary of file names and their corresponding index from the
+                                        `infiles` variable from the RDS file that match the user defined
+                                        configuration parameters.
+                                        Format: {'files': ['<file path>'], 'file_index': [<file index>]}
+        :type target_file_dict:         dict
 
         :param year_list:               A list of integer years that encompass the data range per realization.
         :type year_list:                list
 
-        :param target_scenario:         The scenario name (e.g., rcp26)
-        :type target_scenario:          str
-
-        :return:                        A list of values for a given variable name and target scenario.
+        :return:                        A list of values for a target parameterization
 
         """
 
-        # get the number of years
-        len_yr_list = len(year_list)
+        # number of years for a realization
+        n_years = len(year_list)
 
-        d_scenario_tgav = {}
-        for idx, i in enumerate(target_files):
+        # index of file corresponding to the position of the window containing the data for the target params
+        file_index = target_file_dict['file_index'][0]
 
-            # split path by delim
-            f_split = os.path.basename(i).split('_')
+        # start and end index for the data window to extract
+        start_index = n_years * file_index
+        end_index = n_years * (file_index + 1)
 
-            # scenario name from file name
-            scn_name = f_split[3]
+        return rds_dict[self.RDS_TGAV_NAME][start_index:end_index]
 
-            # get first files ending index value for slicing out data per year
-            if idx == 0:
-                start_idx = 0
-                end_idx = len_yr_list
-            else:
-                start_idx = end_idx
-                end_idx += len_yr_list
+    def tgav_metadata(self, df, target_file_dict):
+        """Create a dictionary holding a data summary about the 'Tgav' dataset and parameter assumptions.
 
-            # add tgav data to scenario dict
-            if scn_name not in d_scenario_tgav:
-                d_scenario_tgav[scn_name] = rds_dict[self.RDS_TGAV_NAME][start_idx:end_idx]
+        :param df:                      Data frame whose 'value' column holds the Tgav values to summarize
 
-            else:
-                msg = f"{self.__class__} Multiple scenarios in target files for {scn_name}."
-                logging.error(msg)
-                raise KeyError(msg)
+        :return:                        Dictionary of parameter values and summary statistics of the data
 
-        # ensure the target scenario is in the dictionary
-        if target_scenario not in d_scenario_tgav:
-            msg = f"{self.__class__} Scenario '{target_scenario}' is not in the RDS supporting climate file options: '{d_scenario_tgav.keys()}'"
-            logging.error(msg)
-            raise KeyError(msg)
+        """
 
-        else:
-            return d_scenario_tgav[target_scenario]
+        from numpy import isfinite
+
+        meta_dict = {'rds_file': self.params[self.RDS_FILE_FIELD],
+                     'scenario': self.params[self.SCENARIO_FIELD],
+                     'climate_var_name': self.params[self.CLIMATE_VAR_NAME_FIELD],
+                     'source_climate_data': target_file_dict['files'][0],
+                     'units': self.params[self.UNITS_FIELD],
+                     'count': df['value'].count(),
+                     'mean': df['value'].mean(),
+                     'median': df['value'].median(),
+                     'min': df['value'].min(),
+                     'max': df['value'].max(),
+                     'std': df['value'].std(),
+                     'na_count': df['value'].isna().sum(),
+                     'null_count': df['value'].isnull().sum(),
+                     'all_finite': isfinite(df['value']).all()}
+
+        for k in meta_dict.keys():
+            logging.info(f"{self.__class__} 'Tgav' data summary: {k}=={meta_dict[k]}")
+            print(f"{self.__class__} '{self.TGAV_CAPABILITY_NAME}' data summary: {k}=={meta_dict[k]}")
+
+        return meta_dict

From c8cbe246e674463e3a70629e89e1c15faf375b18 Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Fri, 12 Jun 2020 14:07:26 -0400
Subject: [PATCH 12/18] test suite for tgav stub

---
 cassandra/test/test_tgav_stub.py | 190 +++++++++++++++++++++++++++++++
 1 file changed, 190 insertions(+)
 create mode 100644 cassandra/test/test_tgav_stub.py

diff --git a/cassandra/test/test_tgav_stub.py b/cassandra/test/test_tgav_stub.py
new file mode 100644
index 0000000..07d9644
--- /dev/null
+++ b/cassandra/test/test_tgav_stub.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python
+"""Test for the Tgav stub component."""
+
+import os
+import unittest
+import pkg_resources
+
+from cassandra.components import TgavStubComponent
+
+# if necessary, set the path to your R_HOME environment variable
+# os.environ['R_HOME'] = '/Library/Frameworks/R.framework/Resources'
+
+
+class TestTgavStubComponent(unittest.TestCase):
+
+    def setUp(self):
+        """Set up a TgavStub component and its intermediate outputs for testing."""
+
+        capability_table = {}
+
+        # configuration
+        self.rds_file = pkg_resources.resource_filename('cassandra', 'test/data/fldgen-IPSL-CM5A-LR_test.rds')
+        self.climate_var_name = 'tasAdjust'
+        self.scenario = 'rcp26'
+        self.units = 'Kelvin'
+        self.start_year = 1861
+        self.through_year = 2099
+
+        # expected output of tgav_metadata() for the configuration above
+        self.meta_dict = {'rds_file': self.rds_file,
+                          'scenario': self.scenario,
+                          'climate_var_name': self.climate_var_name,
+                          'source_climate_data': './training-data/tasAdjust_annual_IPSL-CM5A-LR_rcp26_18610101-20991231.nc',
+                          'units': self.units,
+                          'count': 239,
+                          'mean' : 286.8046116940164,
+                          'median': 286.30762280534697,
+                          'min': 284.78008340211915,
+                          'max': 288.6382439866686,
+                          'std': 1.182328423261446,
+                          'na_count': 0,
+                          'null_count': 0,
+                          'all_finite': True}
+
+        # instantiate class
+        self.stub = TgavStubComponent(capability_table)
+
+        # build parameterization
+        self.stub.addparam('rds_file', self.rds_file)
+        self.stub.addparam('climate_var_name', self.climate_var_name)
+        self.stub.addparam('scenario', self.scenario)
+        self.stub.addparam('units', self.units)
+        self.stub.addparam('start_year', self.start_year)
+        self.stub.addparam('through_year', self.through_year)
+        self.stub.finalize_parsing()
+
+        # read in RDS file to dictionary
+        self.rds_dict = self.stub.rds_to_dict()
+
+        # generate target file dictionary
+        self.target_file_dict = self.stub.build_file_dict(self.rds_dict)
+
+        # generate year list
+        self.year_list = self.stub.build_year_list()
+
+        # generate tgav list
+        self.tgav_list = self.stub.build_data_list(self.rds_dict, self.target_file_dict, self.year_list)
+
+        # generate tgav dataframe
+        self.tgav_df = self.stub.build_dataframe(self.year_list, self.tgav_list)
+
+    def test_log_raise_exception(self):
+        """Ensure the exception class passed in is the one that gets raised."""
+
+        with self.assertRaises(ValueError):
+            self.stub.log_raise_exception(ValueError, 'value_error', log_msg=False)
+
+    def test_validate_int(self):
+        """Expect ValueError for a non-numeric string and an int for a numeric one."""
+
+        with self.assertRaises(ValueError):
+            self.stub.validate_int('fail')
+
+        val = self.stub.validate_int('1984')
+        self.assertIsInstance(val, int)  # assertTrue(type(val), int) could never fail
+        self.assertEqual(val, 1984)
+
+    def test_validate_year(self):
+        """Expect ValueError for years below and above the accepted bounds."""
+
+        # check min bounds error
+        with self.assertRaises(ValueError):
+            self.stub.validate_year(-1)
+
+        # check max bounds error
+        with self.assertRaises(ValueError):
+            self.stub.validate_year(999999)
+
+    def test_validate_file_exist(self):
+        """Expect FileNotFoundError for a missing file and the same path back for an existing one."""
+
+        with self.assertRaises(FileNotFoundError):
+            self.stub.validate_file_exist('/not/a/file.txt')
+
+        fcheck = self.stub.validate_file_exist(self.rds_file)
+        self.assertEqual(fcheck, self.rds_file)
+
+    def test_rds_to_dict(self):
+        """Check that the RDS dictionary has the expected keys and entry counts."""
+
+        # check for keys (assertIn reports the missing key and the container on failure)
+        self.assertIn('tgav', self.rds_dict)
+        self.assertIn('infiles', self.rds_dict)
+
+        # check for data
+        self.assertEqual(len(self.rds_dict['tgav']), 956)
+        self.assertEqual(len(self.rds_dict['infiles']), 8)
+
+    def test_build_file_dict(self):
+        """Ensure the missing-key, no-match and multiple-match errors, and the valid content."""
+
+        # check for missing infiles key
+        with self.assertRaises(KeyError):
+            self.stub.build_file_dict({})
+
+        # check for no matching data
+        with self.assertRaises(ValueError):
+            self.stub.build_file_dict({'infiles': []})
+
+        # check for too many matching files; entries must match the search criteria to reach that branch
+        with self.assertRaises(ValueError):
+            self.stub.build_file_dict({'infiles': [self.meta_dict['source_climate_data']] * 2})
+
+        # valid outputs
+        self.assertEqual(self.target_file_dict['files'][0], self.meta_dict['source_climate_data'])
+        self.assertEqual(self.target_file_dict['file_index'][0], 0)
+
+    def test_build_year_list(self):
+        """Ensure the year list is a list spanning start_year through through_year inclusive."""
+
+        # check type (assertTrue(type(x), list) could never fail)
+        self.assertIsInstance(self.year_list, list)
+
+        # check first and last year
+        self.assertEqual(self.year_list[0], self.start_year)
+        self.assertEqual(self.year_list[-1], self.through_year)
+
+        # check the number of years
+        self.assertEqual(self.through_year - self.start_year + 1, len(self.year_list))
+
+    def test_build_data_list(self):
+        """Check the data list type and its length for one scenario and variable."""
+
+        self.assertIsInstance(self.tgav_list, list)  # assertTrue(type(x), list) could never fail
+
+        # check number of values for a single scenario and variable
+        self.assertEqual(len(self.tgav_list), 239)
+
+    def test_build_dataframe(self):
+        """Confirm the data frame has 239 rows, 5 columns and the expected column order."""
+
+        # check data frame shape
+        self.assertEqual(self.tgav_df.shape, (239, 5))
+
+        # check column names
+        self.assertEqual(list(self.tgav_df.columns), ['year', 'scenario', 'variable', 'value', 'units'])
+
+    def test_tgav_metadata(self):
+        """Check metadata keys and values; floats use a tolerance, not exact equality."""
+
+        meta_dict = self.stub.tgav_metadata(self.tgav_df, self.target_file_dict)
+
+        # check like keys
+        self.assertEqual(meta_dict.keys(), self.meta_dict.keys())
+
+        for k, expected in self.meta_dict.items():
+            check = self.assertAlmostEqual if isinstance(expected, float) else self.assertEqual
+            check(meta_dict[k], expected)
+
+    def test_run_component(self):
+        """Run the full component and expect a return value of 0."""
+
+        rval = self.stub.run_component()
+
+        # test run success
+        self.assertEqual(rval, 0)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 0eaabf459d9f8588bda5d8ba93733324114b2c2b Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Fri, 12 Jun 2020 14:07:47 -0400
Subject: [PATCH 13/18] reduced data for tgav tests

---
 cassandra/test/data/fldgen-IPSL-CM5A-LR_test.rds | Bin 0 -> 3934 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 cassandra/test/data/fldgen-IPSL-CM5A-LR_test.rds

diff --git a/cassandra/test/data/fldgen-IPSL-CM5A-LR_test.rds b/cassandra/test/data/fldgen-IPSL-CM5A-LR_test.rds
new file mode 100644
index 0000000000000000000000000000000000000000..9bcead5453e351a85434ec7c2eb9fab228c137a6
GIT binary patch
literal 3934
zcmV-k525fMiwFP!000001MQi4G}qg^$G_$&BC~{wD9Jn&u??9^5ea2XWT@}wD<zeZ
z6cLd`MS~O>Iu)fl5h<xogA}1qhERsD%){^Pch5ifu655^=ic91=O3%Ar)NLIe)j(C
zy<dCBnU_YRvC-H#S!iq=_$Q6Saih749*xE-422bc1z>+j==HY;Ig4@#z1Awp+oz1s
z-^b?Y$jG5RI_ueQUkJT%5zlMWI70tqZ}K*49ie{+^epO4f*hx`7mX15$0#RPbq>h6
zoK4=sgx=uloOR(Aq1OrKbT2y%xs_vpu@CWLr}up9LOwHwFY!YD!~I9gLf}uyvtQf^
zJG1lA9}AJEq2tZuC$#I;eyP4d=uJN7rJ_<{_v}0zE(-f2UdEF}g#I#I@4*ux)XQ@;
z)EE7|zhmj3FN1N!4HdjvI#b`E+-+Ave<!*{^o0TJKLQn+8)3J!e!qu~x?&gW#Xo@F
zWp{SjVfa_5dtB{>l$?(DQzrD9r)!jClOac@f7$AyU(uHqpUt7aI`?=Y0`W5Ip2^O{
zXWnt8*Q35>2Z7^@(JtE6-;I9iPWfeS?Zi0MLQ~$|#CS}D20U9Z{(R%}iu2(=?z`bk
z1^k$HMXnZ*+qHu&=D@G0nk6k5_L7F2l{51gk^eL`4f7yh7&Ih{cx&dWxtO<)74lcq
zwV|)DjH5fj&h+XraD={-cXQ1>^gG^P*YtzXKl96&-$%aYuyZzCXW$n(&opu^^4siD
zU`oNhlp&rRMCh$X=E>LF2>t7#FoUFH@V|S0*!y?*R~-^RaUOX`iuP_WBlIt|oAU(O
z3B4s}v(sBXwEJ2lq~d$k<(?~M3&VUw{vjVdjJT47BUNhff9BfkZHs<V#@ovR;D05d
zPuK+ZeLRakyCV<pHc7Wa$PkTkc@^|aZ~R(z3UQBwY#J%Z-2-iXSXZryCbZ;C`0a{R
zm=lfmwg+xWI|#jFeZzW=YC>;!yRPC^N$B5tIKFY=dp22qXcF#5+;UZqC~J&sgTdT0
z3D9$&Y_eB}bPA+B*N5CV^6oYp>dKx!P&$A*z29)%e2F|0{5)P5M;lwifn?ZQxV=mj
zJJ7GDLsvNR)c-g*XW281TeWkM9oBDs+xla6`N${bSANS6c{Z6>md!&QQr4t{1#tz^
zN=XjLQ=5Izc`wGvt@_Ft{w-#4ZW#wrU+YaP{<~-|Vu@(L`f6=s|E_}d^Hr_RAS?y4
zq;X&IYDnRX$tV8E$9Jtr;}QH;nHPM({%bAOII&&;cBK}-xld4cYfrz4$IN$MoRS-e
zc~W^%XO4B+Su5$>WCwj|;QkTh?b2g=ynvI?e+uT@IEVZlm%J{u`66D8T{~eLq4yZ1
zhRwYWKW>_v&H=<n7wCjxee?+Nx>e!3c6+PNdtOTD{lcOyS=d+ovKBHou`UK5rTh9~
zKMaO!vJJyJ8#utKeFpUn_`T0{tRi%#VTGk8&X-@0`!A33!q5K-V<-CSAKAB)4fD<X
zm7c|VgwTimeD3mHgdSX(xd8c?M&-w(rqKRWKGp=|8g94du&p8V(IqC-P%fmgftj2Z
zp^qi+<IY4qlOt^gv?)T-qJ+ZSrwM)hMdi^doJXT9jt<vxUQE*36|Z1lObN_2xV(T+
zY;2LjOHV@ASa*o~1neJ$;wbFHsewi*gCDTlzW#}GlA={)tcb+<G#%p6x+D_%yBh*S
zSRYfFyd^T|k3*7$MKg#{TnX|%ea&cp@7r?-^-eD5`M!4>p;%&?&nmAX^ogpNVC@U=
z=dWX{ts)c;C2P4-lu(=}P7l9RL4SIuyNf(wCwEq-{e*mcU?So*q-oSO>1@c2B7<ei
zAS(y<hw8!Kwce|*6}qy1nv@Xo^p0w5hMn_%ja8@^p;)J$o8HNXpN7f@O(jCH=RY{?
zph_ri$FDr!GU1mg8UEq!O#RvK-L0WJMQ`P%K|i-qxNHaf&vqGzaU(u$4ksHA>^tsc
z)Jmc5oJ;=A{;)?W9G)_Rz47X6*9hblF^;!i4?ml`gL3OIAD>N~Lr%ai&wx|12kk;l
zdErIybJ?fyQwMgo*4ou8(0;aKdkKS3ygEh>X-|+ZFn!136xgS`Lb3lT?h?QAoSQL^
zN<W^j&OpAj?)UoakVfIRmD4d!@sTyXZLrf6y4%kp-z^u<U2jn5n3ZKQ=8;qWV72Bs
z<dX~HSDu&|N5{Ix8pzZx&4qi>-rf44WF_LebrRe!qyOnJmwM-!?<BwsS401K!>dFv
z51h-Jr4HyKueQ&j4AjlO@xG!o`r!-<x^QA6@)#D0s9?T1?NZ}z7r;Jcwf>9BEO%$Q
zJImc!?*8-K9oU%=k%;r5|MI5ba?IBNlgrmP7V&Rx?bVki^r78$Wxc3#aBZRT+C$*&
zx&cRiteY{hToQ+V$B!TVY-fn}2CK*1;OnsV8VQ~l^t1VQ-%gBcOyHEl+5*TWFO)Ro
z!8>CbGu;^a#hL(CRrv20(lXcz9&hQX?YM*X{U6Pyo50DC{!`UBFKMriKfD7Tk54m4
z>y{(ld#M^1_(3CzC51a7BT9GO#W_x!iZR=wfpb<r*DYFfhBMMho6I3kaY~7U2kZ&g
zUNnM#JUfgwT+qb%Eimt(Q3w2=Z$1`;JX~?I1^xVJKg5tbl8W=*Cer?b;LLgNXlRoK
zyV=N`?t|b_d{U7=ILVigP=50m{Ldcxaj>0G^IWZW=NS`fZdUfXLHP62pBjaNg97HH
zvI2GSM^)GTtcKoZYV5s;P{LnaS6eQG&Wt`Z!((BKx=Z_;5#K+2sLc@aRD7!6K0*n8
z(0?5V4hywC)K16!U~a&Jln!uEIR3tz(jcKk0yuBP;GQ9&s`w#^4Kg(2_dUN8O001_
zO90#y_tTZS1ultrXGwG7J|Gsd;L^Y&#MdjwC5917dcx&}$tuL{y8KlT{fWqK8PZTB
zl<0P`CFRH~nz#Dx3-C~M+lWI7>KE5o7IOsL7c-3>XlO$Fym|*7^dqV{mOGc1P*RJ!
zE-XQPQm?Yse0M?KUp3`AmXHx5585#<VUJ;PoLiK*TtM}1M?y&!=eFMMgTH=iwf+e5
zrmL;JJAif<Kgk_m5qD<kitD(4%#U)kW_?R288cbIx*S3+aO5gHhWTBvrf*v==1(H>
zj;kT=MUoHmlW&xxy*;ntFy?o@!N{F0^9dz;y7*g~+AMcxxjW0<S?>Px++_}GC$8*=
z)TI8f1@HQ-SG34s9{PK;?p4qDF=VPtu}+xmFXy#j{WCqCdR#D`5&eaaxN#nh*`EsE
zz7agst$QDO3A%{{S;GpBzU<XfmIOa<=0&FAojexMsrcp!;tXHgNa?^|=TNI5_REC&
zxdl`<WS>m-670{3Edc@hlJVZL_dCu4&QA7Nu5AU6Co@&LJLB=rm9uH&D#knZq=!`?
zzQ?o=%dPp?7gGtVYdq`0%i>Z8z8lD6TCLUMIP>1EDKWyi%_h-Pv=`^?WMXA&OanMA
zQxU@D0S;^ZY&c^FF2^KUoM%P9dy-9*Zo$7*tHc_7WaH(Czqu9MEnswd;GM{IRbs{Y
zqu^-UyG6&rSAoY-dMvS!DGSwH!7W~c+Sr<T;Ii5##=8(g%?YP|bcMoh6B*47KJnS=
z=KA~w{ikoTy9u~TZB-aBfM5P4FX066`|9_yT0Yp19vIFC?*uDiZ#+ZZxr-y1c7@>Y
zy=Pa8z(c;1RdZI|11CK{<!E%nud=XsB{(X4OTwz768<KaTyt`f?|gPp2JRJt4BuV9
zaGwx6vbBKVT`e-HtgyafhSP4}xNvWfEExEBstfIVFC<2RhvJ+%UYd!}Tbh-A;oUCo
zu52>%{|F*a!+XX*fRife<4)d)|8nte5$cpNvhs2HLMXZC#amPQVV@uPZ5ZQ_70I6R
z#P?iKEum70cd~SBXi*gSD-*Nbqy9eZ3SZbfrXf#qOul_csKr6-*2>_yqEhCmD2z|x
zx8o<49wC&<<x=s`!)T8Slp4bOU8a|@Q5^RuIq9LDCD#Zgf3;N7X&d4nQ#-7ggjzC@
z(O}p}D5Zex`0wt7Qr<6htmwfkcW1df%iUS-{`1`JJI$H6>LcW{4i5VMnRo8{12tIp
zzpmK?$SZ=E1-nlq;JXYNsW3$F{vEb5j(?pF{b`>3RqV^bhJuMP?8h<f%h~dH$4;7M
zsqvlw4<{s5OunN%b#Hkr*4-3uL|dmPbOu+WS{~l7X)4Vw;PAMW`lrPX;Gj?Qru&%p
z$<mi!wROS2EA6Ffc#pD#=$6aiUBkj?cy$FjE0xn_Ap-es>`m?{xTqdsn5B((X?^&@
z!%^U<j@`Xv=FI;O|7c$p4W2e#jaaq|JS=^ZUkHBiMVjhbAV2ThZhNh#&?V|WX@I+Y
zGO~K6k$At}x4rUXJ-D~&+q+TZ6-;S(8tM#w4GC<wM;+qP&!rB|@Fx1vV+!ZGh`L&<
zh!S|lJ+PnU54^9Ab{Md30bfdQS+T%ha(~NluTt2FZpah`9vP?p+#L@4I@YK#@J*K4
zpk2rSN9scNS%QNLhw_G0FGKPNJ5_uGPeP=4VtS#UcI9wJ9rBFoAhsEvjqYpD#k)_z
zpXl--UMZT+nF%f`(c^kfni6VxCh_S7k5}}Qy#6`RN2Cqh-+*6v`&T^#-?frsZ)<_i
znxj!f%#BcLcAMn|F@I|D8^XN6Tb=2|Y$5pT8zzeX2EJ)0`3{zVmn(;_)Hk#dik!E*
zw>*PTMiNSu>9~&&mt!x^;@)C-L3?8o-kYmWt7cSBpuOkY{0@A#H8rb)guqwxs?U8l
zaF4Nw&TkwnA(UA|JJU^pP_|3tf7OGt7K<nCN^sw?YAG*fK(}TcdFm$tnYxPq6W;r4
z6UuI^u^^O91~EL&g}8b4x|f;>WgoDiHWs?`*0HhA@N?=Iy~u<2uhWC!(FAbcrK`~G
zXBv&h&Pt=vSZM4to|#_=`}hYz@c-AZfY~1Y+eD*H;YScZ+<&@Ru22o3JpF_HgSV^r
zc!qeYhIlfJeRk|-gt&VK2k-U_bhldXV5?%fR@+#`*4~}+-ld`AuCAw}u7*D<8fq(7
zs%vPfuL$=3Kj+iZ{?F#qQTxy4)BDT%EdFP{UDW?)`u=vhzn;Cno$jwE?{BC3-{$VW
sFIA3U&!BCLKNlwJpN;Gx+dV`7{5O|>u&;mMzg?#O1)YpK!0sRb0Qqwjp8x;=

literal 0
HcmV?d00001


From 0bccfbe3b922ef7a91e7dbe4a469e319f9640bc7 Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Fri, 12 Jun 2020 14:08:19 -0400
Subject: [PATCH 14/18] add data frame builder method

---
 cassandra/components.py | 97 +++++++++++++++++++++++++++++++++--------
 1 file changed, 79 insertions(+), 18 deletions(-)

diff --git a/cassandra/components.py b/cassandra/components.py
index a7c6055..37d03d4 100644
--- a/cassandra/components.py
+++ b/cassandra/components.py
@@ -1229,7 +1229,6 @@ def __init__(self, cap_tbl):
 
     def run_component(self):
         """Run the TgavStubComponent component"""
-        import pandas as pd
 
         # ensure required params are present
         self.validate_params()
@@ -1247,17 +1246,11 @@ def run_component(self):
         tgav_list = self.build_data_list(rds_dict, target_file_dict, year_list)
 
         # build a pandas data frame to hold tgav output
-        tgav_df = pd.DataFrame({'year': year_list,
-                                'value': tgav_list})
+        tgav_df = self.build_dataframe(year_list, tgav_list)
 
         # report data summary
         meta_dict = self.tgav_metadata(tgav_df, target_file_dict)
 
-        # additional expected fields
-        tgav_df['scenario'] = self.params[self.SCENARIO_FIELD]
-        tgav_df['variable'] = self.TGAV_CAPABILITY_NAME
-        tgav_df['units'] = self.params[self.UNITS_FIELD]
-
         # add to cassandra result queue
         self.addresults(self.TGAV_CAPABILITY_NAME, tgav_df[self.TGAV_FIELD_ORDER])
         self.addresults('tgav_metadata', meta_dict)
@@ -1344,9 +1337,26 @@ def validate_int(self, value):
         try:
             return int(value)
 
-        except TypeError:
+        except ValueError:
             msg = f"{self.__class__} Value '{value}' not able to be converted to an integer as expected."
-            self.log_raise_exception(TypeError, msg)
+            self.log_raise_exception(ValueError, msg)
+
+    def validate_file_exist(self, file_path):
+        """Return the path if the file exists; otherwise raise FileNotFoundError via `log_raise_exception`.
+
+        :param file_path:               Full path, with file name and extension, to an input file.
+        :type file_path:                str
+
+        :return:                        Validated file path
+
+        """
+
+        if os.path.isfile(file_path):
+            return file_path
+
+        else:
+            msg = f"Input file '{file_path}' does not exist."
+            self.log_raise_exception(FileNotFoundError, msg)
 
     def rds_to_dict(self):
         """Read in and convert an RDS file to a Python dictionary.
@@ -1363,7 +1373,8 @@ def rds_to_dict(self):
         read_rds = robjects.r['readRDS']
 
         # create ListVector object
-        lvect = read_rds(self.params[self.RDS_FILE_FIELD])
+        rds_file = self.validate_file_exist(self.params[self.RDS_FILE_FIELD])
+        lvect = read_rds(rds_file)
 
         # convert the ListVector object to a Python dictionary
         return dict(zip(lvect.names, map(list, list(lvect))))
@@ -1382,8 +1393,21 @@ def build_file_dict(self, rds_dict):
 
         """
 
+        try:
+            infile_list = rds_dict[self.RDS_INFILES_NAME]
+        except KeyError:
+            msg = f"Field '{self.RDS_INFILES_NAME}' is not in the RDS dictionary."
+            self.log_raise_exception(KeyError, msg)
+
         target_file_dict = {}
-        for index, i in enumerate(rds_dict[self.RDS_INFILES_NAME]):
+
+        # add file name to dictionary
+        files = target_file_dict.setdefault('files', [])
+
+        # add file index to dictionary
+        file_index = target_file_dict.setdefault('file_index', [])
+
+        for index, i in enumerate(infile_list):
 
             # get file name from path
             base = os.path.basename(i)
@@ -1395,20 +1419,20 @@ def build_file_dict(self, rds_dict):
                     and (str(self.params[self.THROUGH_YEAR_FIELD]) in base):
 
                 # add file name to dictionary
-                target_file_dict.setdefault('files', []).append(i)
+                files.append(i)
 
                 # add file index to dictionary
-                target_file_dict.setdefault('file_index', []).append(index)
+                file_index.append(index)
 
         # the number of target files found matching the search criteria
         n_files = len(target_file_dict['files'])
 
         if n_files == 0:
-            msg = f"{self.__class__} There are no data sets matching the input parameters in file list: {rds_dict[self.RDS_INFILES_NAME]}. One matching file required."
+            msg = f"{self.__class__} There are no data sets matching the input parameters in file list: {infile_list}. One matching file required."
             self.log_raise_exception(ValueError, msg)
 
         elif n_files > 1:
-            msg = f"{self.__class__} There are {n_files} data sets matching the input parameters in file list: {rds_dict[self.RDS_INFILES_NAME]}. One matching file required."
+            msg = f"{self.__class__} There are {n_files} data sets matching the input parameters in file list: {infile_list}. One matching file required."
             self.log_raise_exception(ValueError, msg)
 
         else:
@@ -1455,12 +1479,49 @@ def build_data_list(self, rds_dict, target_file_dict, year_list):
 
         return rds_dict[self.RDS_TGAV_NAME][start_index:end_index]
 
+    def build_dataframe(self, year_list, tgav_list):
+        """Build output data frame for Tgav.
+
+        :param year_list:               A list of integer years that encompass the data range per realization.
+        :type year_list:                list
+
+        :param tgav_list:               A list of values for a given variable name and target scenario
+        :type tgav_list:                list
+
+        :return:                        A data frame holding Tgav outputs and required ancillary data.
+                                        The 'scenario' and 'units' columns are filled from the
+                                        component parameters and 'variable' from
+                                        `TGAV_CAPABILITY_NAME`; each is a single value repeated
+                                        on every row.  Columns are returned in the order given
+                                        by `TGAV_FIELD_ORDER`.
+        :rtype:                         pandas.DataFrame
+
+        """
+        import pandas as pd
+
+        # build a pandas data frame to hold tgav output
+        df = pd.DataFrame({'year': year_list, 'value': tgav_list})
+
+        # additional expected fields
+        df['scenario'] = self.params[self.SCENARIO_FIELD]
+        df['variable'] = self.TGAV_CAPABILITY_NAME
+        df['units'] = self.params[self.UNITS_FIELD]
+
+        return df[self.TGAV_FIELD_ORDER]
+
     def tgav_metadata(self, df, target_file_dict):
         """Create a dictionary holding a data summary about the 'Tgav' dataset and parameter assumptions.
 
-        :param df:
+        :param df:                      A data frame holding Tgav outputs and required ancillary data
+        :type df:                       data frame
+
+        :param target_file_dict:        A dictionary of file names and their corresponding index from the
+                                        `infiles` variable from the RDS file that match the user defined
+                                        configuration parameters.
+                                        Format: {'files': ['<file path>'], 'file_index': [<file index>]}
+        :type target_file_dict:         dict
 
-        :return:
+        :return:                        A dictionary of metadata for the Tgav data
 
         """
 

From e72c04332b1ad696285f3bdd3b89d3ccc5643ae5 Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Fri, 12 Jun 2020 15:32:36 -0400
Subject: [PATCH 15/18] update reqs and install r for ci

---
 .travis.yml      | 1 +
 requirements.txt | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index 3eb28ff..df60737 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,6 +6,7 @@ matrix:
       dist: xenial
 cache: pip
 install:
+  - echo 'source("https://bioconductor.org/biocLite.R"); biocLite("S4Vectors"); biocLite("GenomicRanges")' > install.R
   - pip install git+https://github.com/JGCRI/gcam_reader
   - pip install git+https://github.com/JGCRI/tethys
   - pip install git+https://github.com/JGCRI/xanthos
diff --git a/requirements.txt b/requirements.txt
index f757a9e..35fae0a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,4 @@
 configobj>=5.0.6
 pandas>=0.20
+numpy>=1.16
+rpy2>=2.9

From 84290411d3dd1ab58614efd2a1a6b707dbdd1bc6 Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Fri, 12 Jun 2020 15:46:11 -0400
Subject: [PATCH 16/18] working on ci

---
 .travis.yml | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index df60737..fea2f81 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,15 +1,35 @@
 language: python
+
 matrix:
   include:
-    - python: 3.6
-    - python: 3.7
+    - os: linux
       dist: xenial
+      python: 3.6
+    - os: linux
+      dist: xenial
+      python: 3.7
+
+before_install:
+  - |
+    sudo apt-get install -y lsb-release
+      sudo apt-key adv \
+        --keyserver keyserver.ubuntu.com \
+        --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
+      sudo add-apt-repository \
+        --yes \
+        "deb https://cloud.r-project.org/bin/linux/ubuntu/ $(lsb_release -c -s)-cran36/"
+      cat /etc/apt/sources.list
+      sudo apt-get update -qq
+      sudo apt-get install -y r-base
+  - sudo Rscript ./install_r_packages.r dplyr ggplot2 tidyr
+
 cache: pip
+
 install:
-  - echo 'source("https://bioconductor.org/biocLite.R"); biocLite("S4Vectors"); biocLite("GenomicRanges")' > install.R
   - pip install git+https://github.com/JGCRI/gcam_reader
   - pip install git+https://github.com/JGCRI/tethys
   - pip install git+https://github.com/JGCRI/xanthos
   - pip install .
+
 script:
   - python -m unittest

From 94a125905edeea8c7125d38590f033d907d157e0 Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Fri, 12 Jun 2020 15:49:05 -0400
Subject: [PATCH 17/18] working on ci

---
 .travis.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index fea2f81..b20f95c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,7 +21,6 @@ before_install:
       cat /etc/apt/sources.list
       sudo apt-get update -qq
       sudo apt-get install -y r-base
-  - sudo Rscript ./install_r_packages.r dplyr ggplot2 tidyr
 
 cache: pip
 

From 630eaa2a411842e16dc87cd9182afca9448fc4b7 Mon Sep 17 00:00:00 2001
From: crvernon <chrisrvernon@gmail.com>
Date: Fri, 12 Jun 2020 16:03:30 -0400
Subject: [PATCH 18/18] working on ci

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index b20f95c..514c906 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,6 +25,7 @@ before_install:
 cache: pip
 
 install:
+  - pip install -Iv rpy2==3.3.3
   - pip install git+https://github.com/JGCRI/gcam_reader
   - pip install git+https://github.com/JGCRI/tethys
   - pip install git+https://github.com/JGCRI/xanthos