Skip to content
39 changes: 39 additions & 0 deletions task_geo/dataset_builders/nasa/area_partition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import numpy as np


def area_partition(df_loc, box_size=4.5):
    """
    Find a small number of small bboxes covering all the locations.

    Every location is assigned to the cell of a regular ``box_size`` x
    ``box_size`` degree grid (anchored at 0, 0) that contains it; the distinct
    cells are returned.

    Parameters
    ----------
    df_loc : pandas.DataFrame
        Needs to contain columns 'lat' and 'lon' with the coordinates.
        Rows with a missing coordinate are ignored.
    box_size : float, optional
        Edge length of each bounding box, in degrees. Defaults to 4.5, the
        value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Size is (number of boxes, 4). Each row is
        [bottom left lat, bottom left lon, top right lat, top right lon].
        Boxes appear in order of first occurrence in ``df_loc``.

    Raises
    ------
    ValueError
        If ``box_size`` is not strictly positive.

    """
    if box_size <= 0:
        raise ValueError("box_size must be strictly positive")

    # location points
    points = df_loc[['lat', 'lon']].drop_duplicates().dropna()

    # bottom left corner of the grid cell containing each location
    bottom_left_lat = np.floor(points['lat'] / box_size) * box_size
    bottom_left_lon = np.floor(points['lon'] / box_size) * box_size

    # NOTE(review): boxes are not clipped to valid coordinate ranges; a point
    # with lat == 90 yields a box reaching 90 + box_size. Confirm the consumer
    # tolerates this.
    bboxes = points.assign(
        bottom_left_lat=bottom_left_lat,
        bottom_left_lon=bottom_left_lon,
        top_right_lat=bottom_left_lat + box_size,
        top_right_lon=bottom_left_lon + box_size,
    )[['bottom_left_lat',
       'bottom_left_lon',
       'top_right_lat',
       'top_right_lon']]

    return bboxes.drop_duplicates().values
104 changes: 99 additions & 5 deletions task_geo/dataset_builders/nasa/nasa_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import requests

from task_geo.dataset_builders.nasa.references import PARAMETERS
from task_geo.dataset_builders.nasa.area_partition import area_partition


def nasa_data_loc(lat, lon, str_start_date, str_end_date, parms_str):
Expand Down Expand Up @@ -45,7 +46,88 @@ def nasa_data_loc(lat, lon, str_start_date, str_end_date, parms_str):
return df


def nasa_connector(df_locations, start_date, end_date=None, parms=None):
def nasa_data_area(bbox, str_start_date, str_end_date, parms_list):
    """
    Extract data for an area. The area is at most 10x10 degrees, the output is
    at 1/2 degrees coordinates.

    Two HTTP requests are made: the first submits the regional query, the
    second downloads the JSON file whose URL is given under
    ``['outputs']['json']`` in the first response.

    Parameters
    ----------
    bbox : list
        [min lat, min lon, max lat, max lon], half-degrees
        max 10x10 degrees. Elements may be builtin numbers or numpy scalars.
    str_start_date : string
        Start date, inserted verbatim into the ``startDate`` query field.
    str_end_date : string
        End date, inserted verbatim into the ``endDate`` query field.
    parms_list : list
        Names of the parameters to request; each becomes a column of the
        output.

    Returns
    -------
    df : pandas.DataFrame
        One column per requested parameter plus 'date', 'lat' and 'lon'.

    Raises
    ------
    requests.HTTPError
        If either HTTP request returns an error status.

    """
    import logging

    base_url = "https://power.larc.nasa.gov/cgi-bin/v1/DataAccess.py"

    identifier = "identifier=Regional"
    parms_str = f"parameters={','.join(parms_list)}"
    user_community = "userCommunity=SSE"
    temporal_average = "tempAverage=DAILY"
    output_format = "outputList=JSON"
    user = "user=anonymous"

    # Format each coordinate on its own rather than slicing ``str(bbox)``:
    # the repr of a list of numpy scalars is not a plain number list on
    # NumPy >= 2 (e.g. ``np.float64(4.5)``), which would corrupt the URL.
    # ``.item()`` converts numpy scalars to builtins; builtins pass through.
    bbox_str = ",".join(
        str(value.item() if hasattr(value, "item") else value)
        for value in bbox
    )

    url = (
        f"{base_url}?request=execute&{identifier}&{parms_str}&"
        f"startDate={str_start_date}&endDate={str_end_date}&"
        f"bbox={bbox_str}&"
        f"{temporal_average}&{output_format}&"
        f"{user_community}&{user}"
    )
    logging.getLogger(__name__).debug("Requesting area data for bbox %s",
                                      bbox_str)

    # Fail loudly on HTTP errors instead of on a confusing KeyError/JSON
    # decoding error further down.
    query_response = requests.get(url)
    query_response.raise_for_status()
    file_response = requests.get(query_response.json()['outputs']['json'])
    file_response.raise_for_status()
    data_json = file_response.json()

    # One frame per grid point. Presumably each parameter entry is a mapping
    # keyed by date, so the frame index holds the dates (it is renamed to
    # 'date' below) -- TODO confirm against the API response format.
    data = [
        pd.DataFrame({**{par: data_coord['properties']['parameter'][par]
                         for par in parms_list},
                      'lat': data_coord['geometry']['coordinates'][1],
                      'lon': data_coord['geometry']['coordinates'][0]
                      }) for data_coord in data_json['features']
    ]
    df = pd.concat(data)
    df.reset_index(inplace=True, drop=False)
    return df.rename(columns={'index': 'date'})


def match_grid_point(locations, df_data):
    """
    Match data from the grid to the single locations.

    Each location is snapped to the nearest coordinate of the form
    ``0.5 * k + 0.25`` (k integer) on both axes. The rows of ``df_data`` lying
    exactly on that grid point are copied, and their 'lat'/'lon' values are
    replaced by the location's own coordinates.

    Parameters
    ----------
    locations : pd.DataFrame
        Unique locations, with columns 'lat' and 'lon'.
    df_data : pd.DataFrame
        The grid data, with columns 'lat' and 'lon' holding grid coordinates.

    Returns
    -------
    pd.DataFrame
        Output dataset, with the same columns as ``df_data`` and a fresh
        0..n-1 index. Locations without a matching grid point contribute no
        rows. Empty (but with the columns of ``df_data``) when ``locations``
        has no rows.

    """
    matched = []
    for row in locations.itertuples():
        # nearest grid coordinate on the quarter-degree-offset half-degree grid
        grid_lat = 0.5 * round(2 * (row.lat - 0.25)) + 0.25
        grid_lon = 0.5 * round(2 * (row.lon - 0.25)) + 0.25

        on_grid_point = (df_data['lat'] == grid_lat) & \
            (df_data['lon'] == grid_lon)
        df_loc = df_data[on_grid_point].copy()

        # report the data under the requested coordinates, not the grid ones
        df_loc['lat'] = row.lat
        df_loc['lon'] = row.lon
        matched.append(df_loc)

    if not matched:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # that still carries the expected columns instead.
        return df_data.iloc[0:0].reset_index(drop=True)

    return pd.concat(matched).reset_index(drop=True)


def nasa_connector(df_locations, start_date, end_date=None, parms=None,
precision='area'):
"""Retrieve meteorologic data from NASA.

Given a dataset with columns country, region, sub_region, lon, and lat, for
Expand All @@ -60,6 +142,9 @@ def nasa_connector(df_locations, start_date, end_date=None, parms=None):
end_date(datetime): End date for the time series (optional)
parms(list of strings): Desired data, accepted are 'temperature',
'humidity', and 'pressure' (optional)
precision(string): Either 'area' (default) for lower precision but
much faster running time, or 'point' for more
precise but much slower running time.

Return:
------
Expand All @@ -85,7 +170,16 @@ def nasa_connector(df_locations, start_date, end_date=None, parms=None):
all_parms = list(itertools.chain.from_iterable([PARAMETERS[p] for p in parms]))
parms_str = f"parameters={','.join(all_parms)}"

return pd.concat([
nasa_data_loc(row.lat, row.lon, str_start_date, str_end_date, parms_str)
for row in locations.itertuples()
])
if precision == 'point':
return pd.concat([
nasa_data_loc(row.lat, row.lon, str_start_date, str_end_date, parms_str)
for row in locations.itertuples()
])
else:
df_data = pd.concat(
[nasa_data_area(list(bbox), str_start_date,
str_end_date, all_parms)
for bbox in area_partition(locations)]
)
df_data.reset_index(drop=True, inplace=True)
return match_grid_point(locations, df_data)