diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py new file mode 100644 index 0000000..e779c21 --- /dev/null +++ b/task_geo/dataset_builders/nasa/area_partition.py @@ -0,0 +1,39 @@ +import numpy as np + + +def area_partition(df_loc): + """ + Find a small number of small bboxes covering all the locations. + + Parameters + ---------- + df_loc : pandas.DataFrame + Need to contain columns 'lat' and 'lon' with the coordinates. + + Returns + ------- + numpy.Array + Size is (number of boxes, 4). + + """ + + # location points + unique_locations = df_loc[['lat', 'lon']].drop_duplicates().dropna() + + # create new columns with top left corner of small bbox containing the + # location + unique_locations['bottom_left_lat'] = \ + np.floor(unique_locations.lat / 4.5) * 4.5 + unique_locations['bottom_left_lon'] = \ + np.floor(unique_locations.lon / 4.5) * 4.5 + unique_locations['top_right_lat'] = \ + unique_locations['bottom_left_lat'] + 4.5 + unique_locations['top_right_lon'] = \ + unique_locations['bottom_left_lon'] + 4.5 + + bboxes = unique_locations[['bottom_left_lat', + 'bottom_left_lon', + 'top_right_lat', + 'top_right_lon']] + + return bboxes.drop_duplicates().values diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py index 8af33f8..7bf20f7 100644 --- a/task_geo/dataset_builders/nasa/nasa_connector.py +++ b/task_geo/dataset_builders/nasa/nasa_connector.py @@ -4,6 +4,7 @@ import requests from task_geo.dataset_builders.nasa.references import PARAMETERS +from task_geo.dataset_builders.nasa.area_partition import area_partition def nasa_data_loc(lat, lon, str_start_date, str_end_date, parms_str): @@ -45,7 +46,88 @@ def nasa_data_loc(lat, lon, str_start_date, str_end_date, parms_str): return df -def nasa_connector(df_locations, start_date, end_date=None, parms=None): +def nasa_data_area(bbox, str_start_date, str_end_date, parms_list): + """ + Extract data for an area. The area is at most 10x10 degrees, the output is + at 1/2 degrees coordinates. + + Parameters + ---------- + bbox : list + [min lat, min lon, max lat, max lon], half-degrees + max 10x10 degrees + str_start_date : string + str_end_date : string + parms_list : list + + Returns + ------- + df : pandas.DataFrame + + """ + base_url = "https://power.larc.nasa.gov/cgi-bin/v1/DataAccess.py" + + identifier = "identifier=Regional" + parms_str = f"parameters={','.join(parms_list)}" + user_community = "userCommunity=SSE" + temporal_average = "tempAverage=DAILY" + output_format = "outputList=JSON" + user = "user=anonymous" + + url = ( + f"{base_url}?request=execute&{identifier}&{parms_str}&" + f"startDate={str_start_date}&endDate={str_end_date}&" + f"bbox={str(bbox)[1:-1].replace('. ', '').replace(' ', '')}&" + f"{temporal_average}&{output_format}&" + f"{user_community}&{user}" + ) + print(bbox) + + response = requests.get(url).json() + data_json = requests.get(response['outputs']['json']).json() + data = [ + pd.DataFrame({**{par: data_coord['properties']['parameter'][par] + for par in parms_list}, + 'lat': data_coord['geometry']['coordinates'][1], + 'lon': data_coord['geometry']['coordinates'][0] + }) for data_coord in data_json['features'] + ] + df = pd.concat(data) + df.reset_index(inplace=True, drop=False) + return df.rename(columns={'index': 'date'}) + + +def match_grid_point(locations, df_data): + """ + Match data from the grid to the single locations. + + Parameters + ---------- + locations : pd.DataFrame + Unique locations. + df_data : pd.DataFrame + The grid data. + + Returns + ------- + pd.DataFrame + Output dataset. + + """ + data = [] + for row in locations.itertuples(): + lat = 0.5 * round(2 * (row.lat - 0.25)) + 0.25 + lon = 0.5 * round(2 * (row.lon - 0.25)) + 0.25 + df_loc = df_data[(df_data.lat == lat) & (df_data.lon == lon)].copy() + df_loc.lat = row.lat + df_loc.lon = row.lon + + data.append(df_loc) + return pd.concat(data).reset_index(drop=True, inplace=False) + + +def nasa_connector(df_locations, start_date, end_date=None, parms=None, + precision='area'): """Retrieve meteorologic data from NASA. Given a dataset with columns country, region, sub_region, lon, and lat, for @@ -60,6 +142,9 @@ def nasa_connector(df_locations, start_date, end_date=None, parms=None): end_date(datetime): End date for the time series (optional) parms(list of strings): Desired data, accepted are 'temperature', 'humidity', and 'pressure' (optional) + precision(string): Either 'area' (deafault) for lower precision but + much faster running time, or 'point' for more + precise but much slower running time. Return: ------ @@ -85,7 +170,16 @@ def nasa_connector(df_locations, start_date, end_date=None, parms=None): all_parms = list(itertools.chain.from_iterable([PARAMETERS[p] for p in parms])) parms_str = f"parameters={','.join(all_parms)}" - return pd.concat([ - nasa_data_loc(row.lat, row.lon, str_start_date, str_end_date, parms_str) - for row in locations.itertuples() - ]) + if precision == 'point': + return pd.concat([ + nasa_data_loc(row.lat, row.lon, str_start_date, str_end_date, parms_str) + for row in locations.itertuples() + ]) + else: + df_data = pd.concat( + [nasa_data_area(list(bbox), str_start_date, + str_end_date, all_parms) + for bbox in area_partition(locations)] + ) + df_data.reset_index(drop=True, inplace=True) + return match_grid_point(locations, df_data)