From 7762c9b9c1ad1eea3e837077de7e2f6082d38ba1 Mon Sep 17 00:00:00 2001 From: DanielRobertNicoud <59664846+DanielRobertNicoud@users.noreply.github.com> Date: Mon, 4 May 2020 18:54:13 +0200 Subject: [PATCH 01/13] Update nasa_connector.py --- .../dataset_builders/nasa/nasa_connector.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py index 8af33f8..1b8ad68 100644 --- a/task_geo/dataset_builders/nasa/nasa_connector.py +++ b/task_geo/dataset_builders/nasa/nasa_connector.py @@ -44,6 +44,55 @@ def nasa_data_loc(lat, lon, str_start_date, str_end_date, parms_str): df['lat'] = lat return df +def nasa_data_area(bbox, str_start_date, str_end_date, parms_list): + """ + Extract data for an area. The area is at most 10x10 degrees, the output is + at 1/2 degrees coordinates. + + Parameters + ---------- + bbox : list + [min lat, min lon, max lat, max lon], half-degrees + max 10x10 degrees + str_start_date : string + str_end_date : string + parms_list : list + + Returns + ------- + df : pandas.DataFrame + + """ + base_url = "https://power.larc.nasa.gov/cgi-bin/v1/DataAccess.py" + + identifier = "identifier=Regional" + parms_str = f"parameters={','.join(parms_list)}" + user_community = "userCommunity=SSE" + temporal_average = "tempAverage=DAILY" + output_format = "outputList=JSON" + user = "user=anonymous" + + url = ( + f"{base_url}?request=execute&{identifier}&{parms_str}&" + f"startDate={str_start_date}&endDate={str_end_date}&" + f"bbox={str(bbox)[1:-1].replace(' ', '')}&{temporal_average}&{output_format}&" + f"{user_community}&{user}" + ) + + response = requests.get(url).json() + data_json = requests.get(response['outputs']['json']).json() + data = [ + pd.DataFrame({ + **{par: data_coord['properties']['parameter'][par] + for par in parms_list}, + 'lat': data_coord['geometry']['coordinates'][0], + 'lon': data_coord['geometry']['coordinates'][1] + }) for data_coord in data_json['features'] + ] + df = pd.concat(data) + df.reset_index(inplace=True, drop=False) + return df.rename(columns={'index': 'date'}) + def nasa_connector(df_locations, start_date, end_date=None, parms=None): """Retrieve meteorologic data from NASA. From 99df4a01fc00cbfb38c92d18f6e8e1df509cc508 Mon Sep 17 00:00:00 2001 From: DanielRobertNicoud <59664846+DanielRobertNicoud@users.noreply.github.com> Date: Mon, 4 May 2020 18:56:54 +0200 Subject: [PATCH 02/13] Update nasa_connector.py --- task_geo/dataset_builders/nasa/nasa_connector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py index 1b8ad68..ca963e5 100644 --- a/task_geo/dataset_builders/nasa/nasa_connector.py +++ b/task_geo/dataset_builders/nasa/nasa_connector.py @@ -75,7 +75,8 @@ def nasa_data_area(bbox, str_start_date, str_end_date, parms_list): url = ( f"{base_url}?request=execute&{identifier}&{parms_str}&" f"startDate={str_start_date}&endDate={str_end_date}&" - f"bbox={str(bbox)[1:-1].replace(' ', '')}&{temporal_average}&{output_format}&" + f"bbox={str(bbox)[1:-1].replace(' ', '')}&" + f"{temporal_average}&{output_format}&" f"{user_community}&{user}" ) From e7032025d69d52fbbf6394794e677dede9f26df1 Mon Sep 17 00:00:00 2001 From: DanielRobertNicoud <59664846+DanielRobertNicoud@users.noreply.github.com> Date: Mon, 4 May 2020 19:01:12 +0200 Subject: [PATCH 03/13] Update nasa_connector.py --- task_geo/dataset_builders/nasa/nasa_connector.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py index ca963e5..696d7f5 100644 --- a/task_geo/dataset_builders/nasa/nasa_connector.py +++ b/task_geo/dataset_builders/nasa/nasa_connector.py @@ -44,6 +44,7 @@ def nasa_data_loc(lat, lon, str_start_date, str_end_date, parms_str): df['lat'] = lat return df + def nasa_data_area(bbox, str_start_date, str_end_date, parms_list): """ Extract data for an area. The area is at most 10x10 degrees, the output is @@ -79,15 +80,14 @@ def nasa_data_area(bbox, str_start_date, str_end_date, parms_list): f"{temporal_average}&{output_format}&" f"{user_community}&{user}" ) - + response = requests.get(url).json() data_json = requests.get(response['outputs']['json']).json() data = [ - pd.DataFrame({ - **{par: data_coord['properties']['parameter'][par] - for par in parms_list}, - 'lat': data_coord['geometry']['coordinates'][0], - 'lon': data_coord['geometry']['coordinates'][1] + pd.DataFrame({**{par: data_coord['properties']['parameter'][par] + for par in parms_list}, + 'lat': data_coord['geometry']['coordinates'][0], + 'lon': data_coord['geometry']['coordinates'][1] }) for data_coord in data_json['features'] ] df = pd.concat(data) From a585e39c77af5a4fa9d7335cd0366ecf070fd5ca Mon Sep 17 00:00:00 2001 From: DanielRobertNicoud <59664846+DanielRobertNicoud@users.noreply.github.com> Date: Mon, 4 May 2020 19:03:21 +0200 Subject: [PATCH 04/13] Update nasa_connector.py --- task_geo/dataset_builders/nasa/nasa_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py index 696d7f5..93c8128 100644 --- a/task_geo/dataset_builders/nasa/nasa_connector.py +++ b/task_geo/dataset_builders/nasa/nasa_connector.py @@ -88,7 +88,7 @@ def nasa_data_area(bbox, str_start_date, str_end_date, parms_list): for par in parms_list}, 'lat': data_coord['geometry']['coordinates'][0], 'lon': data_coord['geometry']['coordinates'][1] - }) for data_coord in data_json['features'] + }) for data_coord in data_json['features'] ] df = pd.concat(data) df.reset_index(inplace=True, drop=False) From aba96ababac2c984702f9c9f5b9fdd37280b4967 Mon Sep 17 00:00:00 2001 From: DanielRobertNicoud <59664846+DanielRobertNicoud@users.noreply.github.com> Date: Fri, 8 May 2020 19:23:44 +0200 Subject: [PATCH 05/13] Create area_partition.py --- .../dataset_builders/nasa/area_partition.py | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 task_geo/dataset_builders/nasa/area_partition.py diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py new file mode 100644 index 0000000..1e768c7 --- /dev/null +++ b/task_geo/dataset_builders/nasa/area_partition.py @@ -0,0 +1,67 @@ +import numpy as np +from numpy.linalg import norm +import pandas as pd +from sklearn.cluster import KMeans + +def area_partition(df_loc): + """ + Find a small number of small bboxes covering all the locations. + + Using k-means repeatedly, we find a small number of boxes of side at most + 10 covering all the given geolocations. + + Notes: + - Does not consider -180 as close to 180. This can lead to suboptimal + solutions (but nothing too bad). + - The fit is doen with the Euclidean distance, not with the + L-infinity metric (which would fit squares). This leads to slightly + suboptimal solutions. + + Parameters + ---------- + df_loc : pandas.DataFrame + Need to contain columns 'lat' and 'lon' with the coordinates. + + Returns + ------- + numpy.Array + Size is (number of boxes, 4). + + """ + + # points to cluster + unique_locations = df_loc[['lat', 'lon']].drop_duplicates().dropna().values + + # do k-means with increasing k until the maximal radius is no bigger than 5 + k = 0 + while True: + k += 1 + kmeans = KMeans(n_clusters = k).fit(unique_locations) + + cluster_centers = kmeans.cluster_centers_ + labels = kmeans.labels_ + cluster_radii = np.empty(k) + for i in range(k): + cluster_radii[i] = max([norm(el - cluster_centers[i]) + for el in unique_locations[labels == i, :]] + ) + max_radius = cluster_radii.max() + + # if the radius is small enough, create bboxes and return + if max_radius <= 5: + bboxes = np.empty((k, 4)) + for i in range(k): + cx, cy = cluster_centers[i, :] + r = cluster_radii[i] + bboxes[i] = [0.5*np.floor(2*(cx - r)), + 0.5*np.floor(2*(cy - r)), + 0.5*np.ceil(2*(cx + r)), + 0.5*np.ceil(2*(cy + r))] + # widen slightly bboxes with zero area + if bboxes[i, 0] == bboxes[i, 2]: + bboxes[i, 0] -= 0.5 + bboxes[i, 2] += 0.5 + if bboxes[i, 1] == bboxes[i, 3]: + bboxes[i, 1] -= 0.5 + bboxes[i, 3] += 0.5 + return bboxes From 3f8cb6a16ec38d2301daf9907d52a2e3da2fa17e Mon Sep 17 00:00:00 2001 From: DanielRobertNicoud <59664846+DanielRobertNicoud@users.noreply.github.com> Date: Fri, 8 May 2020 19:27:17 +0200 Subject: [PATCH 06/13] Update area_partition.py --- task_geo/dataset_builders/nasa/area_partition.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py index 1e768c7..82865ba 100644 --- a/task_geo/dataset_builders/nasa/area_partition.py +++ b/task_geo/dataset_builders/nasa/area_partition.py @@ -1,15 +1,15 @@ import numpy as np from numpy.linalg import norm -import pandas as pd from sklearn.cluster import KMeans + def area_partition(df_loc): """ Find a small number of small bboxes covering all the locations. - + Using k-means repeatedly, we find a small number of boxes of side at most 10 covering all the given geolocations. - + Notes: - Does not consider -180 as close to 180. This can lead to suboptimal solutions (but nothing too bad). @@ -36,7 +36,7 @@ def area_partition(df_loc): k = 0 while True: k += 1 - kmeans = KMeans(n_clusters = k).fit(unique_locations) + kmeans = KMeans(n_clusters=k).fit(unique_locations) cluster_centers = kmeans.cluster_centers_ labels = kmeans.labels_ From 3b3e4579e224ccdea7d66a3b6f70b1b4756fe30c Mon Sep 17 00:00:00 2001 From: DanielRobertNicoud <59664846+DanielRobertNicoud@users.noreply.github.com> Date: Fri, 8 May 2020 19:30:05 +0200 Subject: [PATCH 07/13] Update area_partition.py --- task_geo/dataset_builders/nasa/area_partition.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py index 82865ba..ec1f4d9 100644 --- a/task_geo/dataset_builders/nasa/area_partition.py +++ b/task_geo/dataset_builders/nasa/area_partition.py @@ -53,10 +53,10 @@ def area_partition(df_loc): for i in range(k): cx, cy = cluster_centers[i, :] r = cluster_radii[i] - bboxes[i] = [0.5*np.floor(2*(cx - r)), - 0.5*np.floor(2*(cy - r)), - 0.5*np.ceil(2*(cx + r)), - 0.5*np.ceil(2*(cy + r))] + bboxes[i] = [0.5 * np.floor(2 * (cx - r)), + 0.5 * np.floor(2 * (cy - r)), + 0.5 * np.ceil(2 * (cx + r)), + 0.5 * np.ceil(2 * (cy + r))] # widen slightly bboxes with zero area if bboxes[i, 0] == bboxes[i, 2]: bboxes[i, 0] -= 0.5 From 4fbd5a21b053469c5f7f943b998ed49d7110602c Mon Sep 17 00:00:00 2001 From: DanielRobertNicoud <59664846+DanielRobertNicoud@users.noreply.github.com> Date: Fri, 8 May 2020 19:32:54 +0200 Subject: [PATCH 08/13] Update area_partition.py --- task_geo/dataset_builders/nasa/area_partition.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py index ec1f4d9..2972cb4 100644 --- a/task_geo/dataset_builders/nasa/area_partition.py +++ b/task_geo/dataset_builders/nasa/area_partition.py @@ -1,6 +1,6 @@ -import numpy as np -from numpy.linalg import norm from sklearn.cluster import KMeans +from numpy.linalg import norm +import numpy as np def area_partition(df_loc): From 422e1ed1a31f6f24ddac3ad6b645439b8e0b17c4 Mon Sep 17 00:00:00 2001 From: DanielRobertNicoud <59664846+DanielRobertNicoud@users.noreply.github.com> Date: Fri, 8 May 2020 19:35:32 +0200 Subject: [PATCH 09/13] Update area_partition.py --- task_geo/dataset_builders/nasa/area_partition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py index 2972cb4..dfc04f1 100644 --- a/task_geo/dataset_builders/nasa/area_partition.py +++ b/task_geo/dataset_builders/nasa/area_partition.py @@ -1,6 +1,6 @@ from sklearn.cluster import KMeans -from numpy.linalg import norm import numpy as np +from numpy.linalg import norm def area_partition(df_loc): From 27cefc574d12a71d7e73f7964344df08347e9883 Mon Sep 17 00:00:00 2001 From: DanielRobertNicoud <59664846+DanielRobertNicoud@users.noreply.github.com> Date: Fri, 8 May 2020 19:39:24 +0200 Subject: [PATCH 10/13] Update area_partition.py --- task_geo/dataset_builders/nasa/area_partition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py index dfc04f1..ec1f4d9 100644 --- a/task_geo/dataset_builders/nasa/area_partition.py +++ b/task_geo/dataset_builders/nasa/area_partition.py @@ -1,6 +1,6 @@ -from sklearn.cluster import KMeans import numpy as np from numpy.linalg import norm +from sklearn.cluster import KMeans def area_partition(df_loc): From 6d96f2030f5d3f8b1a052512a58d290b192aff57 Mon Sep 17 00:00:00 2001 From: DanielRobertNicoud <59664846+DanielRobertNicoud@users.noreply.github.com> Date: Sat, 9 May 2020 19:30:47 +0200 Subject: [PATCH 11/13] Meteo data by areas Added functions to extract meteo data by area (less precise but much, much faster). nasa_connector has a new optional parameter 'precision', defaults to 'area', ie the new extraction. 'point' is the older method. --- .../dataset_builders/nasa/area_partition.py | 41 +++++++------ .../dataset_builders/nasa/nasa_connector.py | 59 ++++++++++++++++--- 2 files changed, 73 insertions(+), 27 deletions(-) diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py index ec1f4d9..508ef06 100644 --- a/task_geo/dataset_builders/nasa/area_partition.py +++ b/task_geo/dataset_builders/nasa/area_partition.py @@ -8,7 +8,7 @@ def area_partition(df_loc): Find a small number of small bboxes covering all the locations. Using k-means repeatedly, we find a small number of boxes of side at most - 10 covering all the given geolocations. + 4.5 covering all the given geolocations. Notes: - Does not consider -180 as close to 180. This can lead to suboptimal @@ -16,6 +16,7 @@ def area_partition(df_loc): - The fit is doen with the Euclidean distance, not with the L-infinity metric (which would fit squares). This leads to slightly suboptimal solutions. + - 5x5 boxes are still too big for the API Parameters ---------- @@ -45,23 +46,25 @@ def area_partition(df_loc): cluster_radii[i] = max([norm(el - cluster_centers[i]) for el in unique_locations[labels == i, :]] ) - max_radius = cluster_radii.max() - # if the radius is small enough, create bboxes and return - if max_radius <= 5: - bboxes = np.empty((k, 4)) - for i in range(k): - cx, cy = cluster_centers[i, :] - r = cluster_radii[i] - bboxes[i] = [0.5 * np.floor(2 * (cx - r)), - 0.5 * np.floor(2 * (cy - r)), - 0.5 * np.ceil(2 * (cx + r)), - 0.5 * np.ceil(2 * (cy + r))] - # widen slightly bboxes with zero area - if bboxes[i, 0] == bboxes[i, 2]: - bboxes[i, 0] -= 0.5 - bboxes[i, 2] += 0.5 - if bboxes[i, 1] == bboxes[i, 3]: - bboxes[i, 1] -= 0.5 - bboxes[i, 3] += 0.5 + # create bboxes + bboxes = np.empty((k, 4)) + for i in range(k): + cx, cy = cluster_centers[i, :] + r = cluster_radii[i] + bboxes[i] = [0.5 * np.floor(2 * (cx - r)), + 0.5 * np.floor(2 * (cy - r)), + 0.5 * np.ceil(2 * (cx + r)), + 0.5 * np.ceil(2 * (cy + r))] + # widen slightly bboxes with zero area + if bboxes[i, 0] == bboxes[i, 2]: + bboxes[i, 0] -= 0.5 + bboxes[i, 2] += 0.5 + if bboxes[i, 1] == bboxes[i, 3]: + bboxes[i, 1] -= 0.5 + bboxes[i, 3] += 0.5 + + # if max side smaller than 5, then return + max_side = (bboxes[:, [2, 3]] - bboxes[:, [0, 1]]).max() + if max_side < 5: return bboxes diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py index 93c8128..21d1ccc 100644 --- a/task_geo/dataset_builders/nasa/nasa_connector.py +++ b/task_geo/dataset_builders/nasa/nasa_connector.py @@ -4,6 +4,7 @@ import requests from task_geo.dataset_builders.nasa.references import PARAMETERS +from task_geo.dataset_builders.nasa.area_partition import area_partition def nasa_data_loc(lat, lon, str_start_date, str_end_date, parms_str): @@ -76,7 +77,7 @@ def nasa_data_area(bbox, str_start_date, str_end_date, parms_list): url = ( f"{base_url}?request=execute&{identifier}&{parms_str}&" f"startDate={str_start_date}&endDate={str_end_date}&" - f"bbox={str(bbox)[1:-1].replace(' ', '')}&" + f"bbox={str(bbox)[1:-1].replace('. ', '').replace(' ', '')}&" f"{temporal_average}&{output_format}&" f"{user_community}&{user}" ) @@ -86,8 +87,8 @@ def nasa_data_area(bbox, str_start_date, str_end_date, parms_list): data = [ pd.DataFrame({**{par: data_coord['properties']['parameter'][par] for par in parms_list}, - 'lat': data_coord['geometry']['coordinates'][0], - 'lon': data_coord['geometry']['coordinates'][1] + 'lat': data_coord['geometry']['coordinates'][1], + 'lon': data_coord['geometry']['coordinates'][0] }) for data_coord in data_json['features'] ] df = pd.concat(data) @@ -95,7 +96,37 @@ def nasa_data_area(bbox, str_start_date, str_end_date, parms_list): return df.rename(columns={'index': 'date'}) -def nasa_connector(df_locations, start_date, end_date=None, parms=None): +def match_grid_point(locations, df_data): + """ + Match data from the grid to the single locations. + + Parameters + ---------- + locations : pd.DataFrame + Unique locations. + df_data : pd.DataFrame + The grid data. + + Returns + ------- + pd.DataFrame + Output dataset. + + """ + data = [] + for row in locations.itertuples(): + lat = 0.5 * round(2 * (row.lat - 0.25)) + 0.25 + lon = 0.5 * round(2 * (row.lon - 0.25)) + 0.25 + df_loc = df_data[(df_data.lat==lat) & (df_data.lon==lon)].copy() + df_loc.lat = row.lat + df_loc.lon = row.lon + + data.append(df_loc) + return pd.concat(data).reset_index(drop=True, inplace=False) + + +def nasa_connector(df_locations, start_date, end_date=None, parms=None, + precision='area'): """Retrieve meteorologic data from NASA. Given a dataset with columns country, region, sub_region, lon, and lat, for @@ -110,6 +141,9 @@ def nasa_connector(df_locations, start_date, end_date=None, parms=None): end_date(datetime): End date for the time series (optional) parms(list of strings): Desired data, accepted are 'temperature', 'humidity', and 'pressure' (optional) + precision(string): Either 'area' (deafault) for lower precision but + much faster running time, or 'point' for more + precise but much slower running time. Return: ------ @@ -135,7 +169,16 @@ def nasa_connector(df_locations, start_date, end_date=None, parms=None): all_parms = list(itertools.chain.from_iterable([PARAMETERS[p] for p in parms])) parms_str = f"parameters={','.join(all_parms)}" - return pd.concat([ - nasa_data_loc(row.lat, row.lon, str_start_date, str_end_date, parms_str) - for row in locations.itertuples() - ]) + if precision == 'point': + return pd.concat([ + nasa_data_loc(row.lat, row.lon, str_start_date, str_end_date, parms_str) + for row in locations.itertuples() + ]) + else: + df_data = pd.concat( + [nasa_data_area(list(bbox), str_start_date, + str_end_date, all_parms) + for bbox in area_partition(locations)] + ) + df_data.reset_index(drop=True, inplace=True) + return match_grid_point(locations, df_data) From ad6534c72ba3dbf7623d751ed88c03e443de7bed Mon Sep 17 00:00:00 2001 From: DanielRobertNicoud <59664846+DanielRobertNicoud@users.noreply.github.com> Date: Sun, 10 May 2020 19:37:52 +0200 Subject: [PATCH 12/13] improved performance Rewrote area_partition. Method is "by hand" but faster than k-means. --- .../dataset_builders/nasa/area_partition.py | 71 ++++++------------- .../dataset_builders/nasa/nasa_connector.py | 1 + 2 files changed, 21 insertions(+), 51 deletions(-) diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py index 508ef06..9b15c29 100644 --- a/task_geo/dataset_builders/nasa/area_partition.py +++ b/task_geo/dataset_builders/nasa/area_partition.py @@ -1,23 +1,10 @@ import numpy as np -from numpy.linalg import norm -from sklearn.cluster import KMeans def area_partition(df_loc): """ Find a small number of small bboxes covering all the locations. - Using k-means repeatedly, we find a small number of boxes of side at most - 4.5 covering all the given geolocations. - - Notes: - - Does not consider -180 as close to 180. This can lead to suboptimal - solutions (but nothing too bad). - - The fit is doen with the Euclidean distance, not with the - L-infinity metric (which would fit squares). This leads to slightly - suboptimal solutions. - - 5x5 boxes are still too big for the API - Parameters ---------- df_loc : pandas.DataFrame @@ -30,41 +17,23 @@ def area_partition(df_loc): """ - # points to cluster - unique_locations = df_loc[['lat', 'lon']].drop_duplicates().dropna().values - - # do k-means with increasing k until the maximal radius is no bigger than 5 - k = 0 - while True: - k += 1 - kmeans = KMeans(n_clusters=k).fit(unique_locations) - - cluster_centers = kmeans.cluster_centers_ - labels = kmeans.labels_ - cluster_radii = np.empty(k) - for i in range(k): - cluster_radii[i] = max([norm(el - cluster_centers[i]) - for el in unique_locations[labels == i, :]] - ) - - # create bboxes - bboxes = np.empty((k, 4)) - for i in range(k): - cx, cy = cluster_centers[i, :] - r = cluster_radii[i] - bboxes[i] = [0.5 * np.floor(2 * (cx - r)), - 0.5 * np.floor(2 * (cy - r)), - 0.5 * np.ceil(2 * (cx + r)), - 0.5 * np.ceil(2 * (cy + r))] - # widen slightly bboxes with zero area - if bboxes[i, 0] == bboxes[i, 2]: - bboxes[i, 0] -= 0.5 - bboxes[i, 2] += 0.5 - if bboxes[i, 1] == bboxes[i, 3]: - bboxes[i, 1] -= 0.5 - bboxes[i, 3] += 0.5 - - # if max side smaller than 5, then return - max_side = (bboxes[:, [2, 3]] - bboxes[:, [0, 1]]).max() - if max_side < 5: - return bboxes + # location points + unique_locations = df_loc[['lat', 'lon']].drop_duplicates().dropna() + + # create new columns with top left corner of small bbox containing the + # location + unique_locations['bottom_left_lat'] = \ + np.floor(unique_locations.lat / 4.5) * 4.5 + unique_locations['bottom_left_lon'] = \ + np.floor(unique_locations.lon / 4.5) * 4.5 + unique_locations['top_right_lat'] = \ + unique_locations['bottom_left_lat'] + 4.5 + unique_locations['top_right_lon'] = \ + unique_locations['bottom_left_lon'] + 4.5 + + bboxes = unique_locations[['bottom_left_lat', + 'bottom_left_lon', + 'top_right_lat', + 'top_right_lon']] + + return bboxes.drop_duplicates().values diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py index 21d1ccc..0d3d9d1 100644 --- a/task_geo/dataset_builders/nasa/nasa_connector.py +++ b/task_geo/dataset_builders/nasa/nasa_connector.py @@ -81,6 +81,7 @@ def nasa_data_area(bbox, str_start_date, str_end_date, parms_list): f"{temporal_average}&{output_format}&" f"{user_community}&{user}" ) + print(bbox) response = requests.get(url).json() data_json = requests.get(response['outputs']['json']).json() From 4bd9baa2f648a215189b342231a5e90a9ae76e40 Mon Sep 17 00:00:00 2001 From: DanielRobertNicoud <59664846+DanielRobertNicoud@users.noreply.github.com> Date: Sun, 10 May 2020 23:29:31 +0200 Subject: [PATCH 13/13] Lint fixing --- task_geo/dataset_builders/nasa/area_partition.py | 6 +++--- task_geo/dataset_builders/nasa/nasa_connector.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py index 9b15c29..e779c21 100644 --- a/task_geo/dataset_builders/nasa/area_partition.py +++ b/task_geo/dataset_builders/nasa/area_partition.py @@ -19,7 +19,7 @@ def area_partition(df_loc): # location points unique_locations = df_loc[['lat', 'lon']].drop_duplicates().dropna() - + # create new columns with top left corner of small bbox containing the # location unique_locations['bottom_left_lat'] = \ @@ -30,10 +30,10 @@ def area_partition(df_loc): unique_locations['bottom_left_lat'] + 4.5 unique_locations['top_right_lon'] = \ unique_locations['bottom_left_lon'] + 4.5 - + bboxes = unique_locations[['bottom_left_lat', 'bottom_left_lon', 'top_right_lat', 'top_right_lon']] - + return bboxes.drop_duplicates().values diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py index 0d3d9d1..7bf20f7 100644 --- a/task_geo/dataset_builders/nasa/nasa_connector.py +++ b/task_geo/dataset_builders/nasa/nasa_connector.py @@ -113,15 +113,15 @@ def match_grid_point(locations, df_data): pd.DataFrame Output dataset. - """ + """ data = [] for row in locations.itertuples(): lat = 0.5 * round(2 * (row.lat - 0.25)) + 0.25 lon = 0.5 * round(2 * (row.lon - 0.25)) + 0.25 - df_loc = df_data[(df_data.lat==lat) & (df_data.lon==lon)].copy() + df_loc = df_data[(df_data.lat == lat) & (df_data.lon == lon)].copy() df_loc.lat = row.lat df_loc.lon = row.lon - + data.append(df_loc) return pd.concat(data).reset_index(drop=True, inplace=False)