From 7762c9b9c1ad1eea3e837077de7e2f6082d38ba1 Mon Sep 17 00:00:00 2001
From: DanielRobertNicoud
 <59664846+DanielRobertNicoud@users.noreply.github.com>
Date: Mon, 4 May 2020 18:54:13 +0200
Subject: [PATCH 01/13] Update nasa_connector.py

---
 .../dataset_builders/nasa/nasa_connector.py   | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py
index 8af33f8..1b8ad68 100644
--- a/task_geo/dataset_builders/nasa/nasa_connector.py
+++ b/task_geo/dataset_builders/nasa/nasa_connector.py
@@ -44,6 +44,55 @@ def nasa_data_loc(lat, lon, str_start_date, str_end_date, parms_str):
     df['lat'] = lat
     return df
 
+def nasa_data_area(bbox, str_start_date, str_end_date, parms_list):
+    """
+    Extract data for an area. The area is at most 10x10 degrees, the output is
+    at 1/2 degrees coordinates.
+
+    Parameters
+    ----------
+    bbox : list
+        [min lat, min lon, max lat, max lon], half-degrees
+        max 10x10 degrees
+    str_start_date : string
+    str_end_date : string
+    parms_list : list
+
+    Returns
+    -------
+    df : pandas.DataFrame
+
+    """
+    base_url = "https://power.larc.nasa.gov/cgi-bin/v1/DataAccess.py"
+
+    identifier = "identifier=Regional"
+    parms_str = f"parameters={','.join(parms_list)}"
+    user_community = "userCommunity=SSE"
+    temporal_average = "tempAverage=DAILY"
+    output_format = "outputList=JSON"
+    user = "user=anonymous"
+
+    url = (
+        f"{base_url}?request=execute&{identifier}&{parms_str}&"
+        f"startDate={str_start_date}&endDate={str_end_date}&"
+        f"bbox={str(bbox)[1:-1].replace(' ', '')}&{temporal_average}&{output_format}&"
+        f"{user_community}&{user}"
+    )
+    
+    response = requests.get(url).json()
+    data_json = requests.get(response['outputs']['json']).json()
+    data = [
+        pd.DataFrame({
+            **{par: data_coord['properties']['parameter'][par]
+            for par in parms_list},
+            'lat': data_coord['geometry']['coordinates'][0],
+            'lon': data_coord['geometry']['coordinates'][1]
+        }) for data_coord in data_json['features']
+    ]
+    df = pd.concat(data)
+    df.reset_index(inplace=True, drop=False)
+    return df.rename(columns={'index': 'date'})
+
 
 def nasa_connector(df_locations, start_date, end_date=None, parms=None):
     """Retrieve meteorologic data from NASA.

From 99df4a01fc00cbfb38c92d18f6e8e1df509cc508 Mon Sep 17 00:00:00 2001
From: DanielRobertNicoud
 <59664846+DanielRobertNicoud@users.noreply.github.com>
Date: Mon, 4 May 2020 18:56:54 +0200
Subject: [PATCH 02/13] Update nasa_connector.py

---
 task_geo/dataset_builders/nasa/nasa_connector.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py
index 1b8ad68..ca963e5 100644
--- a/task_geo/dataset_builders/nasa/nasa_connector.py
+++ b/task_geo/dataset_builders/nasa/nasa_connector.py
@@ -75,7 +75,8 @@ def nasa_data_area(bbox, str_start_date, str_end_date, parms_list):
     url = (
         f"{base_url}?request=execute&{identifier}&{parms_str}&"
         f"startDate={str_start_date}&endDate={str_end_date}&"
-        f"bbox={str(bbox)[1:-1].replace(' ', '')}&{temporal_average}&{output_format}&"
+        f"bbox={str(bbox)[1:-1].replace(' ', '')}&"
+        f"{temporal_average}&{output_format}&"
         f"{user_community}&{user}"
     )
     

From e7032025d69d52fbbf6394794e677dede9f26df1 Mon Sep 17 00:00:00 2001
From: DanielRobertNicoud
 <59664846+DanielRobertNicoud@users.noreply.github.com>
Date: Mon, 4 May 2020 19:01:12 +0200
Subject: [PATCH 03/13] Update nasa_connector.py

---
 task_geo/dataset_builders/nasa/nasa_connector.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py
index ca963e5..696d7f5 100644
--- a/task_geo/dataset_builders/nasa/nasa_connector.py
+++ b/task_geo/dataset_builders/nasa/nasa_connector.py
@@ -44,6 +44,7 @@ def nasa_data_loc(lat, lon, str_start_date, str_end_date, parms_str):
     df['lat'] = lat
     return df
 
+
 def nasa_data_area(bbox, str_start_date, str_end_date, parms_list):
     """
     Extract data for an area. The area is at most 10x10 degrees, the output is
@@ -79,15 +80,14 @@ def nasa_data_area(bbox, str_start_date, str_end_date, parms_list):
         f"{temporal_average}&{output_format}&"
         f"{user_community}&{user}"
     )
-    
+
     response = requests.get(url).json()
     data_json = requests.get(response['outputs']['json']).json()
     data = [
-        pd.DataFrame({
-            **{par: data_coord['properties']['parameter'][par]
-            for par in parms_list},
-            'lat': data_coord['geometry']['coordinates'][0],
-            'lon': data_coord['geometry']['coordinates'][1]
+        pd.DataFrame({**{par: data_coord['properties']['parameter'][par]
+                         for par in parms_list},
+                      'lat': data_coord['geometry']['coordinates'][0],
+                      'lon': data_coord['geometry']['coordinates'][1]
         }) for data_coord in data_json['features']
     ]
     df = pd.concat(data)

From a585e39c77af5a4fa9d7335cd0366ecf070fd5ca Mon Sep 17 00:00:00 2001
From: DanielRobertNicoud
 <59664846+DanielRobertNicoud@users.noreply.github.com>
Date: Mon, 4 May 2020 19:03:21 +0200
Subject: [PATCH 04/13] Update nasa_connector.py

---
 task_geo/dataset_builders/nasa/nasa_connector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py
index 696d7f5..93c8128 100644
--- a/task_geo/dataset_builders/nasa/nasa_connector.py
+++ b/task_geo/dataset_builders/nasa/nasa_connector.py
@@ -88,7 +88,7 @@ def nasa_data_area(bbox, str_start_date, str_end_date, parms_list):
                          for par in parms_list},
                       'lat': data_coord['geometry']['coordinates'][0],
                       'lon': data_coord['geometry']['coordinates'][1]
-        }) for data_coord in data_json['features']
+                      }) for data_coord in data_json['features']
     ]
     df = pd.concat(data)
     df.reset_index(inplace=True, drop=False)

From aba96ababac2c984702f9c9f5b9fdd37280b4967 Mon Sep 17 00:00:00 2001
From: DanielRobertNicoud
 <59664846+DanielRobertNicoud@users.noreply.github.com>
Date: Fri, 8 May 2020 19:23:44 +0200
Subject: [PATCH 05/13] Create area_partition.py

---
 .../dataset_builders/nasa/area_partition.py   | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 task_geo/dataset_builders/nasa/area_partition.py

diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py
new file mode 100644
index 0000000..1e768c7
--- /dev/null
+++ b/task_geo/dataset_builders/nasa/area_partition.py
@@ -0,0 +1,67 @@
+import numpy as np
+from numpy.linalg import norm
+import pandas as pd
+from sklearn.cluster import KMeans
+
+def area_partition(df_loc):
+    """
+    Find a small number of small bboxes covering all the locations.
+    
+    Using k-means repeatedly, we find a small number of boxes of side at most
+    10 covering all the given geolocations.
+    
+    Notes:
+        - Does not consider -180 as close to 180. This can lead to suboptimal
+        solutions (but nothing too bad).
+        - The fit is doen with the Euclidean distance, not with the
+        L-infinity metric (which would fit squares). This leads to slightly
+        suboptimal solutions.
+
+    Parameters
+    ----------
+    df_loc : pandas.DataFrame
+        Need to contain columns 'lat' and 'lon' with the coordinates.
+
+    Returns
+    -------
+    numpy.Array
+        Size is (number of boxes, 4).
+
+    """
+
+    # points to cluster
+    unique_locations = df_loc[['lat', 'lon']].drop_duplicates().dropna().values
+
+    # do k-means with increasing k until the maximal radius is no bigger than 5
+    k = 0
+    while True:
+        k += 1
+        kmeans = KMeans(n_clusters = k).fit(unique_locations)
+
+        cluster_centers = kmeans.cluster_centers_
+        labels = kmeans.labels_
+        cluster_radii = np.empty(k)
+        for i in range(k):
+            cluster_radii[i] = max([norm(el - cluster_centers[i])
+                                    for el in unique_locations[labels == i, :]]
+                                   )
+        max_radius = cluster_radii.max()
+
+        # if the radius is small enough, create bboxes and return
+        if max_radius <= 5:
+            bboxes = np.empty((k, 4))
+            for i in range(k):
+                cx, cy = cluster_centers[i, :]
+                r = cluster_radii[i]
+                bboxes[i] = [0.5*np.floor(2*(cx - r)),
+                             0.5*np.floor(2*(cy - r)),
+                             0.5*np.ceil(2*(cx + r)),
+                             0.5*np.ceil(2*(cy + r))]
+                # widen slightly bboxes with zero area
+                if bboxes[i, 0] == bboxes[i, 2]:
+                    bboxes[i, 0] -= 0.5
+                    bboxes[i, 2] += 0.5
+                if bboxes[i, 1] == bboxes[i, 3]:
+                    bboxes[i, 1] -= 0.5
+                    bboxes[i, 3] += 0.5
+            return bboxes

From 3f8cb6a16ec38d2301daf9907d52a2e3da2fa17e Mon Sep 17 00:00:00 2001
From: DanielRobertNicoud
 <59664846+DanielRobertNicoud@users.noreply.github.com>
Date: Fri, 8 May 2020 19:27:17 +0200
Subject: [PATCH 06/13] Update area_partition.py

---
 task_geo/dataset_builders/nasa/area_partition.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py
index 1e768c7..82865ba 100644
--- a/task_geo/dataset_builders/nasa/area_partition.py
+++ b/task_geo/dataset_builders/nasa/area_partition.py
@@ -1,15 +1,15 @@
 import numpy as np
 from numpy.linalg import norm
-import pandas as pd
 from sklearn.cluster import KMeans
 
+
 def area_partition(df_loc):
     """
     Find a small number of small bboxes covering all the locations.
-    
+
     Using k-means repeatedly, we find a small number of boxes of side at most
     10 covering all the given geolocations.
-    
+
     Notes:
         - Does not consider -180 as close to 180. This can lead to suboptimal
         solutions (but nothing too bad).
@@ -36,7 +36,7 @@ def area_partition(df_loc):
     k = 0
     while True:
         k += 1
-        kmeans = KMeans(n_clusters = k).fit(unique_locations)
+        kmeans = KMeans(n_clusters=k).fit(unique_locations)
 
         cluster_centers = kmeans.cluster_centers_
         labels = kmeans.labels_

From 3b3e4579e224ccdea7d66a3b6f70b1b4756fe30c Mon Sep 17 00:00:00 2001
From: DanielRobertNicoud
 <59664846+DanielRobertNicoud@users.noreply.github.com>
Date: Fri, 8 May 2020 19:30:05 +0200
Subject: [PATCH 07/13] Update area_partition.py

---
 task_geo/dataset_builders/nasa/area_partition.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py
index 82865ba..ec1f4d9 100644
--- a/task_geo/dataset_builders/nasa/area_partition.py
+++ b/task_geo/dataset_builders/nasa/area_partition.py
@@ -53,10 +53,10 @@ def area_partition(df_loc):
             for i in range(k):
                 cx, cy = cluster_centers[i, :]
                 r = cluster_radii[i]
-                bboxes[i] = [0.5*np.floor(2*(cx - r)),
-                             0.5*np.floor(2*(cy - r)),
-                             0.5*np.ceil(2*(cx + r)),
-                             0.5*np.ceil(2*(cy + r))]
+                bboxes[i] = [0.5 * np.floor(2 * (cx - r)),
+                             0.5 * np.floor(2 * (cy - r)),
+                             0.5 * np.ceil(2 * (cx + r)),
+                             0.5 * np.ceil(2 * (cy + r))]
                 # widen slightly bboxes with zero area
                 if bboxes[i, 0] == bboxes[i, 2]:
                     bboxes[i, 0] -= 0.5

From 4fbd5a21b053469c5f7f943b998ed49d7110602c Mon Sep 17 00:00:00 2001
From: DanielRobertNicoud
 <59664846+DanielRobertNicoud@users.noreply.github.com>
Date: Fri, 8 May 2020 19:32:54 +0200
Subject: [PATCH 08/13] Update area_partition.py

---
 task_geo/dataset_builders/nasa/area_partition.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py
index ec1f4d9..2972cb4 100644
--- a/task_geo/dataset_builders/nasa/area_partition.py
+++ b/task_geo/dataset_builders/nasa/area_partition.py
@@ -1,6 +1,6 @@
-import numpy as np
-from numpy.linalg import norm
 from sklearn.cluster import KMeans
+from numpy.linalg import norm
+import numpy as np
 
 
 def area_partition(df_loc):

From 422e1ed1a31f6f24ddac3ad6b645439b8e0b17c4 Mon Sep 17 00:00:00 2001
From: DanielRobertNicoud
 <59664846+DanielRobertNicoud@users.noreply.github.com>
Date: Fri, 8 May 2020 19:35:32 +0200
Subject: [PATCH 09/13] Update area_partition.py

---
 task_geo/dataset_builders/nasa/area_partition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py
index 2972cb4..dfc04f1 100644
--- a/task_geo/dataset_builders/nasa/area_partition.py
+++ b/task_geo/dataset_builders/nasa/area_partition.py
@@ -1,6 +1,6 @@
 from sklearn.cluster import KMeans
-from numpy.linalg import norm
 import numpy as np
+from numpy.linalg import norm
 
 
 def area_partition(df_loc):

From 27cefc574d12a71d7e73f7964344df08347e9883 Mon Sep 17 00:00:00 2001
From: DanielRobertNicoud
 <59664846+DanielRobertNicoud@users.noreply.github.com>
Date: Fri, 8 May 2020 19:39:24 +0200
Subject: [PATCH 10/13] Update area_partition.py

---
 task_geo/dataset_builders/nasa/area_partition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py
index dfc04f1..ec1f4d9 100644
--- a/task_geo/dataset_builders/nasa/area_partition.py
+++ b/task_geo/dataset_builders/nasa/area_partition.py
@@ -1,6 +1,6 @@
-from sklearn.cluster import KMeans
 import numpy as np
 from numpy.linalg import norm
+from sklearn.cluster import KMeans
 
 
 def area_partition(df_loc):

From 6d96f2030f5d3f8b1a052512a58d290b192aff57 Mon Sep 17 00:00:00 2001
From: DanielRobertNicoud
 <59664846+DanielRobertNicoud@users.noreply.github.com>
Date: Sat, 9 May 2020 19:30:47 +0200
Subject: [PATCH 11/13] Meteo data by areas

Added functions to extract meteo data by area (less precise but much, much faster).

nasa_connector has a new optional parameter 'precision'. It defaults to 'area', i.e. the new area-based extraction; 'point' selects the older per-location method.
---
 .../dataset_builders/nasa/area_partition.py   | 41 +++++++------
 .../dataset_builders/nasa/nasa_connector.py   | 59 ++++++++++++++++---
 2 files changed, 73 insertions(+), 27 deletions(-)

diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py
index ec1f4d9..508ef06 100644
--- a/task_geo/dataset_builders/nasa/area_partition.py
+++ b/task_geo/dataset_builders/nasa/area_partition.py
@@ -8,7 +8,7 @@ def area_partition(df_loc):
     Find a small number of small bboxes covering all the locations.
 
     Using k-means repeatedly, we find a small number of boxes of side at most
-    10 covering all the given geolocations.
+    4.5 covering all the given geolocations.
 
     Notes:
         - Does not consider -180 as close to 180. This can lead to suboptimal
@@ -16,6 +16,7 @@ def area_partition(df_loc):
         - The fit is doen with the Euclidean distance, not with the
         L-infinity metric (which would fit squares). This leads to slightly
         suboptimal solutions.
+        - 5x5 boxes are still too big for the API
 
     Parameters
     ----------
@@ -45,23 +46,25 @@ def area_partition(df_loc):
             cluster_radii[i] = max([norm(el - cluster_centers[i])
                                     for el in unique_locations[labels == i, :]]
                                    )
-        max_radius = cluster_radii.max()
 
-        # if the radius is small enough, create bboxes and return
-        if max_radius <= 5:
-            bboxes = np.empty((k, 4))
-            for i in range(k):
-                cx, cy = cluster_centers[i, :]
-                r = cluster_radii[i]
-                bboxes[i] = [0.5 * np.floor(2 * (cx - r)),
-                             0.5 * np.floor(2 * (cy - r)),
-                             0.5 * np.ceil(2 * (cx + r)),
-                             0.5 * np.ceil(2 * (cy + r))]
-                # widen slightly bboxes with zero area
-                if bboxes[i, 0] == bboxes[i, 2]:
-                    bboxes[i, 0] -= 0.5
-                    bboxes[i, 2] += 0.5
-                if bboxes[i, 1] == bboxes[i, 3]:
-                    bboxes[i, 1] -= 0.5
-                    bboxes[i, 3] += 0.5
+        # create bboxes
+        bboxes = np.empty((k, 4))
+        for i in range(k):
+            cx, cy = cluster_centers[i, :]
+            r = cluster_radii[i]
+            bboxes[i] = [0.5 * np.floor(2 * (cx - r)),
+                         0.5 * np.floor(2 * (cy - r)),
+                         0.5 * np.ceil(2 * (cx + r)),
+                         0.5 * np.ceil(2 * (cy + r))]
+            # widen slightly bboxes with zero area
+            if bboxes[i, 0] == bboxes[i, 2]:
+                bboxes[i, 0] -= 0.5
+                bboxes[i, 2] += 0.5
+            if bboxes[i, 1] == bboxes[i, 3]:
+                bboxes[i, 1] -= 0.5
+                bboxes[i, 3] += 0.5
+
+        # if max side smaller than 5, then return
+        max_side = (bboxes[:, [2, 3]] - bboxes[:, [0, 1]]).max()
+        if max_side < 5:
             return bboxes
diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py
index 93c8128..21d1ccc 100644
--- a/task_geo/dataset_builders/nasa/nasa_connector.py
+++ b/task_geo/dataset_builders/nasa/nasa_connector.py
@@ -4,6 +4,7 @@
 import requests
 
 from task_geo.dataset_builders.nasa.references import PARAMETERS
+from task_geo.dataset_builders.nasa.area_partition import area_partition
 
 
 def nasa_data_loc(lat, lon, str_start_date, str_end_date, parms_str):
@@ -76,7 +77,7 @@ def nasa_data_area(bbox, str_start_date, str_end_date, parms_list):
     url = (
         f"{base_url}?request=execute&{identifier}&{parms_str}&"
         f"startDate={str_start_date}&endDate={str_end_date}&"
-        f"bbox={str(bbox)[1:-1].replace(' ', '')}&"
+        f"bbox={str(bbox)[1:-1].replace('. ', '').replace(' ', '')}&"
         f"{temporal_average}&{output_format}&"
         f"{user_community}&{user}"
     )
@@ -86,8 +87,8 @@ def nasa_data_area(bbox, str_start_date, str_end_date, parms_list):
     data = [
         pd.DataFrame({**{par: data_coord['properties']['parameter'][par]
                          for par in parms_list},
-                      'lat': data_coord['geometry']['coordinates'][0],
-                      'lon': data_coord['geometry']['coordinates'][1]
+                      'lat': data_coord['geometry']['coordinates'][1],
+                      'lon': data_coord['geometry']['coordinates'][0]
                       }) for data_coord in data_json['features']
     ]
     df = pd.concat(data)
@@ -95,7 +96,37 @@ def nasa_data_area(bbox, str_start_date, str_end_date, parms_list):
     return df.rename(columns={'index': 'date'})
 
 
-def nasa_connector(df_locations, start_date, end_date=None, parms=None):
+def match_grid_point(locations, df_data):
+    """
+    Match data from the grid to the single locations.
+
+    Parameters
+    ----------
+    locations : pd.DataFrame
+        Unique locations.
+    df_data : pd.DataFrame
+        The grid data.
+
+    Returns
+    -------
+    pd.DataFrame
+        Output dataset.
+
+    """    
+    data = []
+    for row in locations.itertuples():
+        lat = 0.5 * round(2 * (row.lat - 0.25)) + 0.25
+        lon = 0.5 * round(2 * (row.lon - 0.25)) + 0.25
+        df_loc = df_data[(df_data.lat==lat) & (df_data.lon==lon)].copy()
+        df_loc.lat = row.lat
+        df_loc.lon = row.lon
+        
+        data.append(df_loc)
+    return pd.concat(data).reset_index(drop=True, inplace=False)
+
+
+def nasa_connector(df_locations, start_date, end_date=None, parms=None,
+                   precision='area'):
     """Retrieve meteorologic data from NASA.
 
     Given a dataset with columns country, region, sub_region, lon, and lat, for
@@ -110,6 +141,9 @@ def nasa_connector(df_locations, start_date, end_date=None, parms=None):
         end_date(datetime): End date for the time series (optional)
         parms(list of strings): Desired data, accepted are 'temperature',
                                 'humidity', and 'pressure' (optional)
+        precision(string): Either 'area' (default) for lower precision but
+                           much faster running time, or 'point' for more
+                           precise but much slower running time.
 
     Return:
     ------
@@ -135,7 +169,16 @@ def nasa_connector(df_locations, start_date, end_date=None, parms=None):
     all_parms = list(itertools.chain.from_iterable([PARAMETERS[p] for p in parms]))
     parms_str = f"parameters={','.join(all_parms)}"
 
-    return pd.concat([
-        nasa_data_loc(row.lat, row.lon, str_start_date, str_end_date, parms_str)
-        for row in locations.itertuples()
-    ])
+    if precision == 'point':
+        return pd.concat([
+            nasa_data_loc(row.lat, row.lon, str_start_date, str_end_date, parms_str)
+            for row in locations.itertuples()
+        ])
+    else:
+        df_data = pd.concat(
+            [nasa_data_area(list(bbox), str_start_date,
+                            str_end_date, all_parms)
+             for bbox in area_partition(locations)]
+        )
+        df_data.reset_index(drop=True, inplace=True)
+        return match_grid_point(locations, df_data)

From ad6534c72ba3dbf7623d751ed88c03e443de7bed Mon Sep 17 00:00:00 2001
From: DanielRobertNicoud
 <59664846+DanielRobertNicoud@users.noreply.github.com>
Date: Sun, 10 May 2020 19:37:52 +0200
Subject: [PATCH 12/13] improved performance

Rewrote area_partition: locations are now assigned "by hand" to the cells of a fixed 4.5-degree grid, which is faster than the previous repeated k-means fit.
---
 .../dataset_builders/nasa/area_partition.py   | 71 ++++++-------------
 .../dataset_builders/nasa/nasa_connector.py   |  1 +
 2 files changed, 21 insertions(+), 51 deletions(-)

diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py
index 508ef06..9b15c29 100644
--- a/task_geo/dataset_builders/nasa/area_partition.py
+++ b/task_geo/dataset_builders/nasa/area_partition.py
@@ -1,23 +1,10 @@
 import numpy as np
-from numpy.linalg import norm
-from sklearn.cluster import KMeans
 
 
 def area_partition(df_loc):
     """
     Find a small number of small bboxes covering all the locations.
 
-    Using k-means repeatedly, we find a small number of boxes of side at most
-    4.5 covering all the given geolocations.
-
-    Notes:
-        - Does not consider -180 as close to 180. This can lead to suboptimal
-        solutions (but nothing too bad).
-        - The fit is doen with the Euclidean distance, not with the
-        L-infinity metric (which would fit squares). This leads to slightly
-        suboptimal solutions.
-        - 5x5 boxes are still too big for the API
-
     Parameters
     ----------
     df_loc : pandas.DataFrame
@@ -30,41 +17,23 @@ def area_partition(df_loc):
 
     """
 
-    # points to cluster
-    unique_locations = df_loc[['lat', 'lon']].drop_duplicates().dropna().values
-
-    # do k-means with increasing k until the maximal radius is no bigger than 5
-    k = 0
-    while True:
-        k += 1
-        kmeans = KMeans(n_clusters=k).fit(unique_locations)
-
-        cluster_centers = kmeans.cluster_centers_
-        labels = kmeans.labels_
-        cluster_radii = np.empty(k)
-        for i in range(k):
-            cluster_radii[i] = max([norm(el - cluster_centers[i])
-                                    for el in unique_locations[labels == i, :]]
-                                   )
-
-        # create bboxes
-        bboxes = np.empty((k, 4))
-        for i in range(k):
-            cx, cy = cluster_centers[i, :]
-            r = cluster_radii[i]
-            bboxes[i] = [0.5 * np.floor(2 * (cx - r)),
-                         0.5 * np.floor(2 * (cy - r)),
-                         0.5 * np.ceil(2 * (cx + r)),
-                         0.5 * np.ceil(2 * (cy + r))]
-            # widen slightly bboxes with zero area
-            if bboxes[i, 0] == bboxes[i, 2]:
-                bboxes[i, 0] -= 0.5
-                bboxes[i, 2] += 0.5
-            if bboxes[i, 1] == bboxes[i, 3]:
-                bboxes[i, 1] -= 0.5
-                bboxes[i, 3] += 0.5
-
-        # if max side smaller than 5, then return
-        max_side = (bboxes[:, [2, 3]] - bboxes[:, [0, 1]]).max()
-        if max_side < 5:
-            return bboxes
+    # location points
+    unique_locations = df_loc[['lat', 'lon']].drop_duplicates().dropna()
+    
+    # create new columns with top left corner of small bbox containing the
+    # location
+    unique_locations['bottom_left_lat'] = \
+        np.floor(unique_locations.lat / 4.5) * 4.5
+    unique_locations['bottom_left_lon'] = \
+        np.floor(unique_locations.lon / 4.5) * 4.5
+    unique_locations['top_right_lat'] = \
+        unique_locations['bottom_left_lat'] + 4.5
+    unique_locations['top_right_lon'] = \
+        unique_locations['bottom_left_lon'] + 4.5
+    
+    bboxes = unique_locations[['bottom_left_lat',
+                               'bottom_left_lon',
+                               'top_right_lat',
+                               'top_right_lon']]
+    
+    return bboxes.drop_duplicates().values
diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py
index 21d1ccc..0d3d9d1 100644
--- a/task_geo/dataset_builders/nasa/nasa_connector.py
+++ b/task_geo/dataset_builders/nasa/nasa_connector.py
@@ -81,6 +81,7 @@ def nasa_data_area(bbox, str_start_date, str_end_date, parms_list):
         f"{temporal_average}&{output_format}&"
         f"{user_community}&{user}"
     )
+    print(bbox)
 
     response = requests.get(url).json()
     data_json = requests.get(response['outputs']['json']).json()

From 4bd9baa2f648a215189b342231a5e90a9ae76e40 Mon Sep 17 00:00:00 2001
From: DanielRobertNicoud
 <59664846+DanielRobertNicoud@users.noreply.github.com>
Date: Sun, 10 May 2020 23:29:31 +0200
Subject: [PATCH 13/13] Lint fixing

---
 task_geo/dataset_builders/nasa/area_partition.py | 6 +++---
 task_geo/dataset_builders/nasa/nasa_connector.py | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/task_geo/dataset_builders/nasa/area_partition.py b/task_geo/dataset_builders/nasa/area_partition.py
index 9b15c29..e779c21 100644
--- a/task_geo/dataset_builders/nasa/area_partition.py
+++ b/task_geo/dataset_builders/nasa/area_partition.py
@@ -19,7 +19,7 @@ def area_partition(df_loc):
 
     # location points
     unique_locations = df_loc[['lat', 'lon']].drop_duplicates().dropna()
-    
+
     # create new columns with top left corner of small bbox containing the
     # location
     unique_locations['bottom_left_lat'] = \
@@ -30,10 +30,10 @@ def area_partition(df_loc):
         unique_locations['bottom_left_lat'] + 4.5
     unique_locations['top_right_lon'] = \
         unique_locations['bottom_left_lon'] + 4.5
-    
+
     bboxes = unique_locations[['bottom_left_lat',
                                'bottom_left_lon',
                                'top_right_lat',
                                'top_right_lon']]
-    
+
     return bboxes.drop_duplicates().values
diff --git a/task_geo/dataset_builders/nasa/nasa_connector.py b/task_geo/dataset_builders/nasa/nasa_connector.py
index 0d3d9d1..7bf20f7 100644
--- a/task_geo/dataset_builders/nasa/nasa_connector.py
+++ b/task_geo/dataset_builders/nasa/nasa_connector.py
@@ -113,15 +113,15 @@ def match_grid_point(locations, df_data):
     pd.DataFrame
         Output dataset.
 
-    """    
+    """
     data = []
     for row in locations.itertuples():
         lat = 0.5 * round(2 * (row.lat - 0.25)) + 0.25
         lon = 0.5 * round(2 * (row.lon - 0.25)) + 0.25
-        df_loc = df_data[(df_data.lat==lat) & (df_data.lon==lon)].copy()
+        df_loc = df_data[(df_data.lat == lat) & (df_data.lon == lon)].copy()
         df_loc.lat = row.lat
         df_loc.lon = row.lon
-        
+
         data.append(df_loc)
     return pd.concat(data).reset_index(drop=True, inplace=False)