# Import modules
import os
import json
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pystac_client
import planetary_computer as pc
import plotly.express as px
import plotly.io as pio
import rasterio
from rasterio import windows
from rasterio import features
from rasterio import warp
from skimage import io
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.inspection import permutation_importance
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from pystac.extensions.eo import EOExtension as eo
# setup renderer
if 'google.colab' in str(get_ipython()):
    pio.renderers.default = "colab"
else:
    pio.renderers.default = "jupyterlab"
rng = np.random.RandomState(0)
!pip install rasterstats
from rasterstats import zonal_stats
Successfully installed rasterstats-0.19.0 simplejson-3.19.1
# Import data for the Jupyter Notebook
os.chdir("/home/jovyan/work/GEOG3300_Assignment_Folder/central_asia_aoi/central_asia_aoi")
# Create a path to the working folder
assignment_path = os.path.join(os.getcwd())
# List the files in the working folder to check that we have the correct files in the correct folder
os.listdir(assignment_path)
['central_asia_aoi_3_crop_code.cpg', 'central_asia_aoi_3_crop_code.shx', 'central_asia_aoi_3_crop_code.shp', 'central_asia_aoi_3_crop_code.dbf', 'central_asia_aoi_2_crop_code.gpkg', 'central_asia_aoi_3_crop_code.prj', 'central_asia_aoi_4_crop_code.cpg', 'central_asia_aoi_4_crop_code.shx', 'central_asia_aoi_4_crop_code.dbf', 'central_asia_aoi_4_crop_code.prj', 'central_asia_aoi_1_crop_code.gpkg', 'central_asia_aoi_4_crop_code.shp']
# Create a path to each vector dataset containing field data, then read each dataset into a separate GeoPandas GeoDataFrame named after its polygon number
pol1_path = os.path.join(os.getcwd(), "central_asia_aoi_1_crop_code.gpkg")
pol1_gdf = gpd.read_file(pol1_path)
pol2_path = os.path.join(os.getcwd(), "central_asia_aoi_2_crop_code.gpkg")
pol2_gdf = gpd.read_file(pol2_path)
pol3_path = os.path.join(os.getcwd(), "central_asia_aoi_3_crop_code.shp")
pol3_gdf = gpd.read_file(pol3_path)
pol4_path = os.path.join(os.getcwd(), "central_asia_aoi_4_crop_code.shp")
pol4_gdf = gpd.read_file(pol4_path)
# Inspect that pol1 has been read correctly by printing the data type and the first few rows
print(type(pol1_gdf))
pol1_gdf.head()
<class 'geopandas.geodataframe.GeoDataFrame'>
| | country | region | year | season | crop_1 | aoi | crop1_code | geometry |
|---|---|---|---|---|---|---|---|---|
| 0 | Uzbekistan | Fergana | 2018 | summer | cotton | aoi_1 | 0 | MULTIPOLYGON (((71.18975 40.59049, 71.18977 40... |
| 1 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19309 40.58909, 71.19210 40... |
| 2 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19293 40.58684, 71.19218 40... |
| 3 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19390 40.58708, 71.19298 40... |
| 4 | Uzbekistan | Fergana | 2018 | summer | cotton | aoi_1 | 0 | MULTIPOLYGON (((71.17040 40.55147, 71.16873 40... |
# Inspect that pol2 has been read correctly by printing the data type and the first few rows
print(type(pol2_gdf))
pol2_gdf.head()
<class 'geopandas.geodataframe.GeoDataFrame'>
| | country | region | year | season | crop_1 | aoi | crop1_code | geometry |
|---|---|---|---|---|---|---|---|---|
| 0 | Uzbekistan | Samarkand | 2016 | permanent | vineyard | aoi_2 | 6 | MULTIPOLYGON (((66.98073 39.10310, 66.98211 39... |
| 1 | Uzbekistan | Samarkand | 2016 | unclear | vegetables | aoi_2 | 7 | MULTIPOLYGON (((66.97909 39.10714, 66.97979 39... |
| 2 | Uzbekistan | Samarkand | 2016 | permanent | vineyard | aoi_2 | 6 | MULTIPOLYGON (((66.97806 39.10726, 66.97809 39... |
| 3 | Uzbekistan | Samarkand | 2016 | summer | maize | aoi_2 | 5 | MULTIPOLYGON (((66.98070 39.10780, 66.98098 39... |
| 4 | Uzbekistan | Samarkand | 2016 | winter | wheat | aoi_2 | 1 | MULTIPOLYGON (((66.97461 39.10863, 66.97308 39... |
# Inspect that pol3 has been read correctly by printing the data type and the first few rows
print(type(pol3_gdf))
pol3_gdf.head()
<class 'geopandas.geodataframe.GeoDataFrame'>
| | country | region | year | season | crop_1 | aoi | crop1_code | geometry |
|---|---|---|---|---|---|---|---|---|
| 0 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.19614 39.19174, 65.19652 39.18742... |
| 1 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.19085 39.19327, 65.18710 39.19826... |
| 2 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.19075 39.18957, 65.19072 39.18759... |
| 3 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.18067 39.18739, 65.18743 39.18734... |
| 4 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.18304 39.18958, 65.17674 39.18661... |
# Inspect that pol4 has been read correctly by printing the data type and the first few rows
print(type(pol4_gdf))
pol4_gdf.head()
<class 'geopandas.geodataframe.GeoDataFrame'>
| | country | region | year | season | crop_1 | aoi | crop1_code | geometry |
|---|---|---|---|---|---|---|---|---|
| 0 | Uzbekistan | Kashkadarya | 2018 | winter | wheat | aoi_4 | 1 | POLYGON ((65.30076 38.79538, 65.30125 38.79608... |
| 1 | Uzbekistan | Kashkadarya | 2018 | summer | cotton | aoi_4 | 0 | POLYGON ((65.29607 38.79463, 65.29717 38.79513... |
| 2 | Uzbekistan | Kashkadarya | 2018 | winter | wheat | aoi_4 | 1 | POLYGON ((65.28487 38.80078, 65.29384 38.80442... |
| 3 | Uzbekistan | Kashkadarya | 2018 | permanent | orchard | aoi_4 | 2 | POLYGON ((65.29744 38.79249, 65.29819 38.79281... |
| 4 | Uzbekistan | Kashkadarya | 2018 | permanent | orchard | aoi_4 | 2 | POLYGON ((65.29721 38.79514, 65.29837 38.79563... |
We read the four spatial data files containing cropping field boundaries into GeoDataFrames, then checked the data type of each object using the type() function. This returned "<class 'geopandas.geodataframe.GeoDataFrame'>" for all four objects, confirming that the data were read successfully as GeoDataFrames.
We also inspected the structure of each GeoDataFrame's attribute table using the head() function, which returned the first five rows of each table. Each table has columns storing non-spatial attributes and a geometry column (a GeoSeries) storing the geometric data for each row, giving spatial context to the non-spatial attributes. The presence of this vector geometry column confirmed that these datasets are GeoPandas GeoDataFrames.
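The same checks can be written more compactly. Below is a minimal sketch (the file-name dictionary and loop are illustrative only, not the code used above) that reads all four files from assignment_path and asserts that each loads as a GeoDataFrame.
# Sketch: read the four field-boundary files and confirm each is a GeoDataFrame.
# File names come from the folder listing above; the dictionary itself is illustrative.
field_files = {
    "pol1": "central_asia_aoi_1_crop_code.gpkg",
    "pol2": "central_asia_aoi_2_crop_code.gpkg",
    "pol3": "central_asia_aoi_3_crop_code.shp",
    "pol4": "central_asia_aoi_4_crop_code.shp",
}
field_gdfs = {name: gpd.read_file(os.path.join(assignment_path, fname)) for name, fname in field_files.items()}
for name, gdf in field_gdfs.items():
    assert isinstance(gdf, gpd.GeoDataFrame), f"{name} did not load as a GeoDataFrame"
    print(name, gdf.shape, list(gdf.columns))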
# Visualize pol1 in an interactive map
m1 = pol1_gdf.explore(column="crop_1", categorical=True, cmap=["#232727", "#9EB400", "#A90BD3", "red", "green", "#CC8500"], tiles="OpenStreetMap")
m1
# Visualize pol2 in an interactive map
m2 = pol2_gdf.explore(column="crop_1", categorical=True, cmap=["#260BD3", "brown", "#232727", "#9EB400", "red", "#04400B", "purple", "#CC8500"], tiles="OpenStreetMap")
m2
# Visualize pol3 in an interactive map
m3 = pol3_gdf.explore(column="crop_1", categorical=True, cmap=["#232727", "#CC8500"], tiles="OpenStreetMap")
m3
# Visualize pol4 in an interactive map
m4 = pol4_gdf.explore(column="crop_1", categorical=True, cmap=["#260BD3", "#232727", "#D800CE", "red", "purple", "#CC8500"], tiles="OpenStreetMap")
m4
OpenStreetMap tiles were used to provide context for the area by displaying physical features such as roads and watercourses, as well as conceptual features such as administrative boundaries. Qualitative colour scales were used to represent crop types, as they are categorical data. Darker colours were chosen for the crop types to provide sufficient contrast with the lighter base map, and crop colours were chosen to give enough contrast between crop types so that viewers can distinguish them easily. Crop colours were kept consistent across the four maps, making it easier for viewers to follow the crop attributes.
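The consistent colours rely on explore() pairing each colour list with the crop categories in sorted order. A minimal sketch of how the same mapping could be kept in a single dictionary (assembled from the colour lists used above) is shown below.
# Sketch: one crop-to-colour mapping reused for every map, built from the lists above.
crop_colours = {
    "alfalfa": "#260BD3", "beans": "brown", "cotton": "#232727",
    "fallow": "#D800CE", "maize": "#9EB400", "onions": "#A90BD3",
    "orchard": "red", "rice": "green", "vegetables": "#04400B",
    "vineyard": "purple", "wheat": "#CC8500",
}
def crop_cmap(gdf):
    # Return colours for the crops present, in the sorted category order explore() uses
    return [crop_colours[crop] for crop in sorted(gdf["crop_1"].unique())]
# e.g. pol1_gdf.explore(column="crop_1", categorical=True, cmap=crop_cmap(pol1_gdf), tiles="OpenStreetMap")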
# Check the CRS of Pol1.
print(f"The crs of Polygon 1 is {pol1_gdf.crs}")
# Explore the crop types present and the corresponding number of observations for each type
print("The number of samples by crop type for Polygon 1:")
pol1_gdf.groupby('crop_1').count().loc[:,"crop1_code"]
The crs of Polygon 1 is epsg:4326
The number of samples by crop type for Polygon 1:
crop_1
cotton     34
maize       1
onions      1
orchard     1
rice        2
wheat      35
Name: crop1_code, dtype: int64
# Check the CRS of Pol2.
print(f"The crs of Polygon 2 is {pol2_gdf.crs}")
# Explore the crop types present and the corresponding number of observations for each type
print("The number of samples by crop type for Polygon 2:")
pol2_gdf.groupby('crop_1').count().loc[:,"crop1_code"]
The crs of Polygon 2 is epsg:4326
The number of samples by crop type for Polygon 2:
crop_1
alfalfa       22
beans          2
cotton         1
maize          9
orchard       15
vegetables     7
vineyard      26
wheat         23
Name: crop1_code, dtype: int64
# Check the CRS of Pol3.
print(f"The crs of Polygon 3 is {pol3_gdf.crs}")
# Explore the crop types present and the corresponding number of observations for each type
print("The number of samples by crop type for Polygon 3:")
pol3_gdf.groupby('crop_1').count().loc[:,"crop1_code"]
The crs of Polygon 3 is epsg:4326
The number of samples by crop type for Polygon 3:
crop_1
cotton    64
wheat     55
Name: crop1_code, dtype: int64
# Check the CRS of Pol4.
print(f"The crs of Polygon 4 is {pol4_gdf.crs}")
# Explore the crop types present and the corresponding number of observations for each type
print("The number of samples by crop type for Polygon 4:")
pol4_gdf.groupby('crop_1').count().loc[:,"crop1_code"]
The crs of Polygon 4 is epsg:4326
The number of samples by crop type for Polygon 4:
crop_1
alfalfa      1
cotton      55
fallow       3
orchard     12
vineyard     1
wheat       75
Name: crop1_code, dtype: int64
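To compare class balance across the four areas of interest, the per-AOI counts above could be combined into one table. A minimal sketch using the pandas imported earlier:
# Sketch: combine the crop counts of the four AOIs into a single table for comparison.
crop_counts = pd.concat(
    {f"aoi_{i}": gdf.groupby("crop_1")["crop1_code"].count()
     for i, gdf in enumerate([pol1_gdf, pol2_gdf, pol3_gdf, pol4_gdf], start=1)},
    axis=1,
).fillna(0).astype(int)
print(crop_counts)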
# Create the smallest bounding box geometry around pol1
pol1_env = gpd.GeoSeries(pol1_gdf.geometry.unary_union).envelope
# Visualize the bounding box and the field polygons within with an interactive map to check that the envelope was generated correctly
pol1_env.explore(m=m1, color="red", style_kwds={"fillOpacity": 0})
# Set the CRS of pol1 bounding box to EPSG:4326
pol1_env = pol1_env.set_crs("EPSG:4326")
# Print the new CRS and check that the type of data is a GeoSeries object
print(pol1_env.crs)
print(type(pol1_env))
EPSG:4326
<class 'geopandas.geoseries.GeoSeries'>
# Create the smallest bounding box geometry around pol2
pol2_env = gpd.GeoSeries(pol2_gdf.geometry.unary_union).envelope
# Visualize the bounding box and the field polygons within with an interactive map to check that the envelope was generated correctly
pol2_env.explore(m=m2, color="red", style_kwds={"fillOpacity": 0})
# Set the CRS of pol2 bounding box to EPSG:4326
pol2_env = pol2_env.set_crs("EPSG:4326")
# Print the new CRS and check that the type of data is a GeoSeries object
print(pol2_env.crs)
print(type(pol2_env))
EPSG:4326
<class 'geopandas.geoseries.GeoSeries'>
# Create the smallest bounding box geometry around pol3
pol3_env = gpd.GeoSeries(pol3_gdf.geometry.unary_union).envelope
# Visualize the bounding box and the field polygons within with an interactive map to check that the envelope was generated correctly
pol3_env.explore(m=m3, color="red", style_kwds={"fillOpacity": 0})
# Set the CRS of pol3 bounding box to EPSG:4326
pol3_env = pol3_env.set_crs("EPSG:4326")
# Print the new CRS and check that the type of data is a GeoSeries object
print(pol3_env.crs)
print(type(pol3_env))
EPSG:4326
<class 'geopandas.geoseries.GeoSeries'>
# Create the smallest bounding box geometry around pol4
pol4_env = gpd.GeoSeries(pol4_gdf.geometry.unary_union).envelope
# Visualize the bounding box and the field polygons within with an interactive map to check that the envelope was generated correctly
pol4_env.explore(m=m4, color="red", style_kwds={"fillOpacity": 0})
# Set the CRS of pol4 bounding box to EPSG:4326
pol4_env = pol4_env.set_crs("EPSG:4326")
# Print the new CRS and check that the type of data is a GeoSeries object
print(pol4_env.crs)
print(type(pol4_env))
EPSG:4326
<class 'geopandas.geoseries.GeoSeries'>
We used the unary_union operation to combine the polygons into one geometry before computing the envelope of each GeoDataFrame. Without unary_union, the envelope property would compute a separate envelope for every polygon geometry in the GeoDataFrame, producing many small envelopes rather than one that covers the whole area of interest. After unary_union merges all polygon geometries into a single geometry, envelope returns one bounding box that covers the whole area of interest.
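The difference is easy to verify; a minimal sketch (the counts in the comment come from the Polygon 1 data above):
# Sketch: without unary_union, .envelope returns one box per field polygon;
# with it, a single box covers the whole AOI.
per_feature_envelopes = pol1_gdf.geometry.envelope
aoi_envelope = gpd.GeoSeries(pol1_gdf.geometry.unary_union).envelope
print(len(per_feature_envelopes), len(aoi_envelope))  # e.g. 74 vs 1 for Polygon 1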
# Transform the pol1 bounding box to EPSG:4326
pol1_env = pol1_env.to_crs("EPSG:4326")
# Check that the bounding box has been transformed to EPSG:4326 by visualizing on the map. It should appear in the same location as the previous map with field polygons
m1e = pol1_env.explore(color="red", style_kwds={"fillOpacity": 0}, tiles = "OpenStreetMap")
m1e
# Transform the pol2 bounding box to EPSG:4326
pol2_env = pol2_env.to_crs("EPSG:4326")
# Check that the bounding box has been transformed to EPSG:4326 by visualizing on the map. It should appear in the same location as the previous map with field polygons
m2e = pol2_env.explore(color="red", style_kwds={"fillOpacity": 0}, tiles = "OpenStreetMap")
m2e
# Transform the pol3 bounding box to EPSG:4326
pol3_env = pol3_env.to_crs("EPSG:4326")
# Check that the bounding box has been transformed to EPSG:4326 by visualizing on the map. It should appear in the same location as the previous map with field polygons
m3e = pol3_env.explore(color="red", style_kwds={"fillOpacity": 0}, tiles = "OpenStreetMap")
m3e
# Transform the pol4 bounding box to EPSG:4326
pol4_env = pol4_env.to_crs("EPSG:4326")
# Check that the bounding box has been transformed to EPSG:4326 by visualizing on the map. It should appear in the same location as the previous map with field polygons
m4e = pol4_env.explore(color="red", style_kwds={"fillOpacity": 0}, tiles = "OpenStreetMap")
m4e
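Note the difference between set_crs (used earlier) and to_crs (used here): set_crs only attaches CRS metadata to the existing coordinates, while to_crs reprojects the coordinate values into the target CRS. A minimal sketch, where EPSG:32642 (UTM zone 42N) is the projected CRS reported in the Sentinel-2 metadata for Polygon 1:
# Sketch: set_crs labels the geometry with a CRS without changing coordinates;
# to_crs transforms the coordinates into the target CRS.
env_wgs84 = pol1_env.set_crs("EPSG:4326", allow_override=True)
env_utm = env_wgs84.to_crs("EPSG:32642")
print(env_wgs84.total_bounds)  # decimal degrees
print(env_utm.total_bounds)    # metres (UTM zone 42N)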
We will create a Random Forest Classifier machine learning model to predict crop types using predictor variables derived from raster datasets, then evaluate the model using a range of metrics. The raster datasets will be sourced from Sentinel-2, which is freely available in the public domain. The Sentinel-2 Level-2A product is corrected for the scattering and absorption effects of atmospheric particles, giving an accurate representation of the Earth's surface. It has a high spatial resolution of up to 10 m and a high revisit frequency of five days at the equator. Since most of our field polygons are longer than 30 metres, the raster datasets are suitable for our model. The high temporal resolution allows us to collect multi-date time-series data, which captures the differing phenology of crops at specific growth stages. Sentinel-2 also captures red-edge spectral bands, which are highly sensitive to chlorophyll content and can therefore enhance the ability to distinguish crop types and phenological states.
A review of crop-type classification technologies shows that Normalized Difference Vegetation Index (NDVI) data have been applied intensively for predicting crop types and can achieve accuracies of over 90%. For example, a study by Belgiu obtained overall accuracies of 78% to 96% using NDVI time series from Sentinel-2. NDVI has its own limitations, as it relies on differences in spectral signatures, which can be similar for different crops at specific phenological stages. Therefore, we will use cloud-free satellite images from May to October, the growing period for crops in Uzbekistan. By using data from the growing period, we ensure that data from periods with no cropping activity are excluded and that changes in vegetation over time are taken into account. Studies have also shown that models including red-edge bands achieve higher classification accuracy, especially for vegetation data. Based on this evidence, we decided to use NDVI and red-edge bands from Sentinel-2, collected from May to October 2018, as our predictors. We chose datasets from 2018 because that is the year the field data were collected.
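For reference, the NDVI used below is computed as NDVI = (NIR - Red) / (NIR + Red) and is bounded between -1 and 1. A minimal sketch with illustrative reflectance values, using the numpy imported above:
# Illustrative values only; higher NDVI indicates denser green vegetation.
nir_example = np.array([3000.0, 2500.0, 400.0])
red_example = np.array([800.0, 1200.0, 380.0])
print((nir_example - red_example) / (nir_example + red_example))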
# Open a connection to the Microsoft Planetary Computer's root STAC catalog
pc_catalog = pystac_client.Client.open(
url="https://planetarycomputer.microsoft.com/api/stac/v1",)
# Download NDVI data for Polygon 1
# Get the bounding box as a shapely object
pol1_env_shapely = pol1_env[0]
# Create an empty list to store ndarray of monthly ndvi values
ndvi_arr1 = []
# Pre-set time to May-October, 2018
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol1_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to append the lowest cloud cover in the item collection to the list
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the red and NIR least cloudy Sentinel-2 assets
least_cloudy_red_href = pc.sign(least_cloudy.assets["B04"].href)
least_cloudy_nir08_href = pc.sign(least_cloudy.assets["B08"].href)
# For the first month get the shape of the array
# For subsequent months resample all arrays to this shape
if i == 5:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol1_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
red = src.read(1, window=aoi_window)
# get affine transform for the windowed read
# this allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# create a copy of the metadata for saving later
out_meta1 = src.meta
# get the shape of the red ndarray
# force all other rasters to match this shape
out_shape1 = red.shape
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol1_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
nir = src.read(1, out_shape=out_shape1, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
else:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol1_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
red = src.read(1, out_shape=out_shape1, window=aoi_window)
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol1_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
nir = src.read(1, out_shape=out_shape1, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
# Append NDVI ndarray to list
ndvi_arr1.append(ndvi)
# Stack the ndvi ndarray's to create a multiband raster
ndvi_stacked1 = np.stack(ndvi_arr1, axis=0)
print(f"the shape of ndvi_arr is {ndvi_stacked1.shape}")
# Inspect the metadata
print(out_meta1)
# Update meta for saving
out_meta1["dtype"] = "float32"
out_meta1["count"] = 6
out_meta1["height"] = ndvi_arr1[0].shape[0]
out_meta1["width"] = ndvi_arr1[0].shape[1]
out_meta1["transform"] = win_transform
print(out_meta1)
# Visualize data imported by an animated time-series diagram
figNDVI1 = px.imshow(
ndvi_stacked1,
animation_frame=0,
color_continuous_scale="viridis",
range_color=[0, 0.8],
height = 800,
width = 600)
figNDVI1.update_xaxes(showticklabels=False)
figNDVI1.update_yaxes(showticklabels=False)
figNDVI1.show()
processing month 5
processing month 6
processing month 7
processing month 8
processing month 9
processing month 10
the shape of ndvi_arr is (6, 487, 315)
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 10980, 'height': 10980, 'count': 1, 'crs': CRS.from_epsg(32642), 'transform': Affine(10.0, 0.0, 600000.0, 0.0, -10.0, 4500000.0)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 315, 'height': 487, 'count': 6, 'crs': CRS.from_epsg(32642), 'transform': Affine(10.0, 0.0, 683405.9436293562, 0.0, -10.0, 4496096.317958007)}
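The metadata were updated above so the stack can be saved later. A minimal sketch (the output filename is hypothetical) of how the updated metadata could be used to write the stacked NDVI as a multiband GeoTIFF:
# Sketch: write the 6-band NDVI stack using the updated metadata (hypothetical filename).
with rasterio.open("pol1_ndvi_stack.tif", "w", **out_meta1) as dst:
    dst.write(ndvi_stacked1.astype("float32"))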
# Download Red-edge data for Polygon 1
# Get the bounding box as a shapely object
pol1_env_shapely = pol1_env[0]
# Create an empty list to store ndarrays of monthly red-edge values
rededge_arr1 = []
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol1_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to append the lowest cloud cover in the item collection to the list
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the Red-edge least cloudy Sentinel-2 assets
least_cloudy_rededge8A_href = pc.sign(least_cloudy.assets["B8A"].href)
# For the first month get the shape of the array
# For subsequent months resample all arrays to this shape
if i == 5:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol1_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# Create a copy of the metadata for saving later
out_meta_rededge1 = src.meta
# Get the shape of the rededge ndarray
# Force all other rasters to match this shape
out_shape_rededge8A = rededge8A.shape
else:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol1_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window, out_shape=out_shape_rededge8A)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
rededge8A = rededge8A.astype("float32")
# Append rededge ndarray to list
rededge_arr1.append(rededge8A)
# Stack the rededge ndarray's to create a multiband raster
rededge_stacked1 = np.stack(rededge_arr1, axis=0)
print(f"the shape of rededge_arr is {rededge_stacked1.shape}")
# Inspect the metadata
print(out_meta_rededge1)
# Update meta for saving
out_meta_rededge1["dtype"] = "float32"
out_meta_rededge1["count"] = 6
out_meta_rededge1["height"] = rededge_arr1[0].shape[0]
out_meta_rededge1["width"] = rededge_arr1[0].shape[1]
out_meta_rededge1["transform"] = win_transform
print(out_meta_rededge1)
# Visualize data imported by an animated time-series diagram
figRE1 = px.imshow(
rededge_stacked1,
animation_frame=0,
color_continuous_scale="IceFire",
range_color=[0, 5000],
height = 800,
width = 600)
figRE1.update_xaxes(showticklabels=False)
figRE1.update_yaxes(showticklabels=False)
figRE1.show()
processing month 5
processing month 6
processing month 7
processing month 8
processing month 9
processing month 10
the shape of rededge_arr is (6, 244, 158)
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 5490, 'height': 5490, 'count': 1, 'crs': CRS.from_epsg(32642), 'transform': Affine(20.0, 0.0, 600000.0, 0.0, -20.0, 4500000.0)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 158, 'height': 244, 'count': 6, 'crs': CRS.from_epsg(32642), 'transform': Affine(20.0, 0.0, 683405.9436293562, 0.0, -20.0, 4496096.317958007)}
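The same search-and-windowed-read pattern is repeated below for every area of interest, month, and band. A hedged sketch of how it could be factored into a helper function (names and structure are illustrative, not the code actually run in this notebook):
# Sketch: find the least cloudy Sentinel-2 L2A scene for a month and read one band
# windowed to the AOI bounding box.
def read_band_for_month(catalog, aoi_shapely, year, month, asset_key, out_shape=None):
    start = f"{year}-{month:02d}-01"
    end = f"{year}-{month + 1:02d}-01" if month < 12 else f"{year + 1}-01-01"
    search = catalog.search(
        collections=["sentinel-2-l2a"],
        intersects=aoi_shapely,
        datetime=f"{start}/{end}",
        query={"eo:cloud_cover": {"lt": 10}})
    items = search.item_collection()
    least_cloudy = min(items, key=lambda item: eo.ext(item).cloud_cover)
    href = pc.sign(least_cloudy.assets[asset_key].href)
    with rasterio.open(href) as src:
        bounds = warp.transform_bounds("epsg:4326", src.crs, *features.bounds(aoi_shapely))
        window = windows.from_bounds(transform=src.transform, *bounds)
        data = src.read(1, window=window, out_shape=out_shape).astype("float32")
        return data, src.window_transform(window), src.meta
# Example usage mirroring the NDVI loop above (commented out, illustrative only):
# ndvi_list, shape = [], None
# for month in range(5, 11):
#     red, win_transform, meta = read_band_for_month(pc_catalog, pol1_env_shapely, 2018, month, "B04", shape)
#     shape = shape or red.shape
#     nir, _, _ = read_band_for_month(pc_catalog, pol1_env_shapely, 2018, month, "B08", shape)
#     ndvi_list.append((nir - red) / (nir + red))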
# Download NDVI data for Polygon 2
# Get the bounding box as a shapely object
pol2_env_shapely = pol2_env[0]
# Create an empty list to store ndarray of monthly ndvi values
ndvi_arr2 = []
# Pre-set time to May-October, 2018
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol2_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to append the lowest cloud cover in the item collection to the list
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the red and NIR least cloudy Sentinel-2 assets
least_cloudy_red_href = pc.sign(least_cloudy.assets["B04"].href)
least_cloudy_nir08_href = pc.sign(least_cloudy.assets["B08"].href)
# For the first month get the shape of the array
# For subsequent months resample all arrays to this shape
if i == 5:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol2_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
red = src.read(1, window=aoi_window)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# Create a copy of the metadata for saving later
out_meta2 = src.meta
# Get the shape of the red ndarray
# Force all other rasters to match this shape
out_shape2 = red.shape
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol2_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
nir = src.read(1, out_shape=out_shape2, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
else:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol2_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
red = src.read(1, out_shape=out_shape2, window=aoi_window)
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol2_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
nir = src.read(1, out_shape=out_shape2, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
# Append NDVI ndarray to list
ndvi_arr2.append(ndvi)
# Stack the ndvi ndarray's to create a multiband raster
ndvi_stacked2 = np.stack(ndvi_arr2, axis=0)
print(f"the shape of ndvi_arr is {ndvi_stacked2.shape}")
# Inspect the metadata
print(out_meta2)
# Update meta for saving
out_meta2["dtype"] = "float32"
out_meta2["count"] = 6
out_meta2["height"] = ndvi_arr2[0].shape[0]
out_meta2["width"] = ndvi_arr2[0].shape[1]
out_meta2["transform"] = win_transform
print(out_meta2)
# Visualize data imported by an animated time-series diagram
figNDVI2 = px.imshow(
ndvi_stacked2,
animation_frame=0,
color_continuous_scale="viridis",
range_color=[0, 0.8],
height = 800,
width = 600)
figNDVI2.update_xaxes(showticklabels=False)
figNDVI2.update_yaxes(showticklabels=False)
figNDVI2.show()
processing month 5
processing month 6
processing month 7
processing month 8
processing month 9
processing month 10
the shape of ndvi_arr is (6, 906, 1118)
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 10980, 'height': 10980, 'count': 1, 'crs': CRS.from_epsg(32642), 'transform': Affine(10.0, 0.0, 300000.0, 0.0, -10.0, 4400040.0)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 1118, 'height': 906, 'count': 6, 'crs': CRS.from_epsg(32642), 'transform': Affine(10.0, 0.0, 314518.683904971, 0.0, -10.0, 4338935.877525085)}
# Download Red-edge data for Polygon 2
# Get the bounding box as a shapely object
pol2_env_shapely = pol2_env[0]
# Create an empty list to store ndarray of monthly ndvi values
rededge_arr2 = []
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol2_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to append the lowest cloud cover in the item collection to the list
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the Red-edge least cloudy Sentinel-2 assets
least_cloudy_rededge8A_href = pc.sign(least_cloudy.assets["B8A"].href)
# For the first month get the shape of the array
# For subsequent months resample all arrays to this shape
if i == 5:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol2_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# Create a copy of the metadata for saving later
out_meta_rededge2 = src.meta
# Get the shape of the rededge ndarray
# Force all other rasters to match this shape
out_shape_rededge8A2 = rededge8A.shape
else:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol2_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window, out_shape=out_shape_rededge8A2)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
rededge8A = rededge8A.astype("float32")
# Append rededge ndarray to list
rededge_arr2.append(rededge8A)
# Stack the rededge ndarray's to create a multiband raster
rededge_stacked2 = np.stack(rededge_arr2, axis=0)
print(f"the shape of rededge_arr is {rededge_stacked2.shape}")
# Inspect the metadata
print(out_meta_rededge2)
# Update meta for saving
out_meta_rededge2["dtype"] = "float32"
out_meta_rededge2["count"] = 6
out_meta_rededge2["height"] = rededge_arr2[0].shape[0]
out_meta_rededge2["width"] = rededge_arr2[0].shape[1]
out_meta_rededge2["transform"] = win_transform
print(out_meta_rededge2)
# Visualize data imported by an animated time-series diagram
figRE2 = px.imshow(
rededge_stacked2,
animation_frame=0,  # axis 0 is the month dimension of the stacked array
color_continuous_scale="IceFire",
range_color=[0, 5000],
height = 800,
width = 600)
figRE2.update_xaxes(showticklabels=False)
figRE2.update_yaxes(showticklabels=False)
figRE2.show()
processing month 5
processing month 6
processing month 7
processing month 8
processing month 9
processing month 10
the shape of rededge_arr is (6, 453, 559)
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 5490, 'height': 5490, 'count': 1, 'crs': CRS.from_epsg(32642), 'transform': Affine(20.0, 0.0, 300000.0, 0.0, -20.0, 4400040.0)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 559, 'height': 453, 'count': 6, 'crs': CRS.from_epsg(32642), 'transform': Affine(20.0, 0.0, 314518.683904971, 0.0, -20.0, 4338935.877525085)}
# Download NDVI data for Polygon 3
# Get the bounding box as a shapely object
pol3_env_shapely = pol3_env[0]
# Create an empty list to store ndarray of monthly ndvi values
ndvi_arr3 = []
# Pre-set time to May-October, 2018
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol3_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to append the lowest cloud cover in the item collection to the list
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the red and NIR least cloudy Sentinel-2 assets
least_cloudy_red_href = pc.sign(least_cloudy.assets["B04"].href)
least_cloudy_nir08_href = pc.sign(least_cloudy.assets["B08"].href)
# For the first month get the shape of the array
# For subsequent months resample all arrays to this shape
if i == 5:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol3_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
red = src.read(1, window=aoi_window)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# Create a copy of the metadata for saving later
out_meta3 = src.meta
# Get the shape of the red ndarray
# Force all other rasters to match this shape
out_shape3 = red.shape
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol3_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
nir = src.read(1, out_shape=out_shape3, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
else:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol3_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
red = src.read(1, out_shape=out_shape3, window=aoi_window)
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol3_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
nir = src.read(1, out_shape=out_shape3, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
# Append NDVI ndarray to list
ndvi_arr3.append(ndvi)
# Stack the ndvi ndarray's to create a multiband raster
ndvi_stacked3 = np.stack(ndvi_arr3, axis=0)
print(f"the shape of ndvi_arr is {ndvi_stacked3.shape}")
# Inspect the metadata
print(out_meta3)
# Update meta for saving
out_meta3["dtype"] = "float32"
out_meta3["count"] = 6
out_meta3["height"] = ndvi_arr3[0].shape[0]
out_meta3["width"] = ndvi_arr3[0].shape[1]
out_meta3["transform"] = win_transform
print(out_meta3)
# Visualize data imported by an animated time-series diagram
figNDVI3 = px.imshow(
ndvi_stacked3,
animation_frame=0,
color_continuous_scale="viridis",
range_color=[0, 0.4],
height = 800,
width = 600)
figNDVI3.update_xaxes(showticklabels=False)
figNDVI3.update_yaxes(showticklabels=False)
figNDVI3.show()
processing month 5
processing month 6
processing month 7
processing month 8
processing month 9
processing month 10
the shape of ndvi_arr is (6, 837, 768)
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 10980, 'height': 10980, 'count': 1, 'crs': CRS.from_epsg(32641), 'transform': Affine(10.0, 0.0, 600000.0, 0.0, -10.0, 4400040.0)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 768, 'height': 837, 'count': 6, 'crs': CRS.from_epsg(32641), 'transform': Affine(10.0, 0.0, 682112.5818372199, 0.0, -10.0, 4344341.2613596255)}
# Download Red-edge data for Polygon 3
# Get the bounding box as a shapely object
pol3_env_shapely = pol3_env[0]
# Create an empty list to store ndarray of monthly ndvi values
rededge_arr3 = []
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol3_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to append the lowest cloud cover in the item collection to the list
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the Red-edge least cloudy Sentinel-2 assets
least_cloudy_rededge8A_href = pc.sign(least_cloudy.assets["B8A"].href)
# For the first month get the shape of the array
# For subsequent months resample all arrays to this shape
if i == 5:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol3_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# Create a copy of the metadata for saving later
out_meta_rededge3 = src.meta
# Get the shape of the rededge ndarray
# Force all other rasters to match this shape
out_shape_rededge8A3 = rededge8A.shape
else:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol3_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window, out_shape=out_shape_rededge8A3)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
rededge8A = rededge8A.astype("float32")
# Append rededge ndarray to list
rededge_arr3.append(rededge8A)
# Stack the rededge ndarray's to create a multiband raster
rededge_stacked3 = np.stack(rededge_arr3, axis=0)
print(f"the shape of rededge_arr is {rededge_stacked3.shape}")
# Inspect the metadata
print(out_meta_rededge3)
# Update meta for saving
out_meta_rededge3["dtype"] = "float32"
out_meta_rededge3["count"] = 6
out_meta_rededge3["height"] = rededge_arr3[0].shape[0]
out_meta_rededge3["width"] = rededge_arr3[0].shape[1]
out_meta_rededge3["transform"] = win_transform
print(out_meta_rededge3)
# Visualize data imported by an animated time-series diagram
figRE3 = px.imshow(
rededge_stacked3,
animation_frame=0,
color_continuous_scale="IceFire",
range_color=[0, 5000],
height = 800,
width = 600)
figRE3.update_xaxes(showticklabels=False)
figRE3.update_yaxes(showticklabels=False)
figRE3.show()
processing month 5
processing month 6
processing month 7
processing month 8
processing month 9
processing month 10
the shape of rededge_arr is (6, 419, 384)
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 5490, 'height': 5490, 'count': 1, 'crs': CRS.from_epsg(32641), 'transform': Affine(20.0, 0.0, 600000.0, 0.0, -20.0, 4400040.0)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 384, 'height': 419, 'count': 6, 'crs': CRS.from_epsg(32641), 'transform': Affine(20.0, 0.0, 682112.5818372199, 0.0, -20.0, 4344341.2613596255)}
# Download NDVI data for Polygon 4
# Get the bounding box as a shapely object
pol4_env_shapely = pol4_env[0]
# Create an empty list to store ndarray of monthly ndvi values
ndvi_arr4 = []
# Pre-set time to May-October, 2018
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol4_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to append the lowest cloud cover in the item collection to the list
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the red and NIR least cloudy Sentinel-2 assets
least_cloudy_red_href = pc.sign(least_cloudy.assets["B04"].href)
least_cloudy_nir08_href = pc.sign(least_cloudy.assets["B08"].href)
# For the first month get the shape of the array
# For subsequent months resample all arrays to this shape
if i == 5:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol4_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
red = src.read(1, window=aoi_window)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# Create a copy of the metadata for saving later
out_meta4 = src.meta
# Get the shape of the red ndarray
# Force all other rasters to match this shape
out_shape4 = red.shape
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol4_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
nir = src.read(1, out_shape=out_shape4, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
else:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol4_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
red = src.read(1, out_shape=out_shape4, window=aoi_window)
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol4_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
nir = src.read(1, out_shape=out_shape4, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
# Append NDVI ndarray to list
ndvi_arr4.append(ndvi)
# Stack the ndvi ndarray's to create a multiband raster
ndvi_stacked4 = np.stack(ndvi_arr4, axis=0)
print(f"the shape of ndvi_arr is {ndvi_stacked4.shape}")
# Inspect the metadata
print(out_meta4)
# Update meta for saving
out_meta4["dtype"] = "float32"
out_meta4["count"] = 6
out_meta4["height"] = ndvi_arr4[0].shape[0]
out_meta4["width"] = ndvi_arr4[0].shape[1]
out_meta4["transform"] = win_transform
print(out_meta4)
# Visualize data imported by an animated time-series diagram
figNDVI4 = px.imshow(
ndvi_stacked4,
animation_frame=0,
color_continuous_scale="viridis",
range_color=[0, 0.4],
height = 800,
width = 600)
figNDVI4.update_xaxes(showticklabels=False)
figNDVI4.update_yaxes(showticklabels=False)
figNDVI4.show()
processing month 5
processing month 6
processing month 7
/tmp/ipykernel_58/1739019700.py:121: RuntimeWarning: invalid value encountered in divide
processing month 8
processing month 9
processing month 10
the shape of ndvi_arr is (6, 874, 379)
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 10980, 'height': 10980, 'count': 1, 'crs': CRS.from_epsg(32641), 'transform': Affine(10.0, 0.0, 699960.0, 0.0, -10.0, 4400040.0)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 379, 'height': 874, 'count': 6, 'crs': CRS.from_epsg(32641), 'transform': Affine(10.0, 0.0, 696976.8515412887, 0.0, -10.0, 4298978.419482002)}
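The RuntimeWarning above is raised where nir + red equals zero (for example at nodata edges), so the NDVI division produces NaN for those pixels. A minimal sketch of one way the division inside the loop could be guarded:
# Sketch: only divide where the denominator is non-zero; other pixels stay NaN.
ndvi = np.full_like(nir, np.nan, dtype="float32")
np.divide(nir - red, nir + red, out=ndvi, where=(nir + red) != 0)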
# Download Red-edge data for Polygon 4
# Get the bounding box as a shapely object
pol4_env_shapely = pol4_env[0]
# Create an empty list to store ndarray of monthly ndvi values
rededge_arr4 = []
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol4_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to store the cloud cover value of each item in the collection
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the Red-edge least cloudy Sentinel-2 assets
least_cloudy_rededge8A_href = pc.sign(least_cloudy.assets["B8A"].href)
if i == 5:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol4_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window)
# Get affine transform for the windowed read
# this allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# Create a copy of the metadata for saving later
out_meta_rededge4 = src.meta
# Get the shape of the rededge ndarray
# Force all other rasters to match this shape
out_shape_rededge8A4 = rededge8A.shape
else:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol4_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window, out_shape=out_shape_rededge8A4)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
rededge8A = rededge8A.astype("float32")
# Append rededge ndarray to list
rededge_arr4.append(rededge8A)
# Stack the Red-edge ndarrays to create a multiband raster
rededge_stacked4 = np.stack(rededge_arr4, axis=0)
print(f"the shape of rededge_arr is {rededge_stacked4.shape}")
# Inspect the metadata
print(out_meta_rededge4)
# Update meta for saving
out_meta_rededge4["dtype"] = "float32"
out_meta_rededge4["count"] = 6
out_meta_rededge4["height"] = rededge_arr4[0].shape[0]
out_meta_rededge4["width"] = rededge_arr4[0].shape[1]
out_meta_rededge4["transform"] = win_transform
print(out_meta_rededge4)
# Visualize the imported data as an animated time series
figRE4 = px.imshow(
rededge_stacked4,
animation_frame=0,
color_continuous_scale="IceFire",
range_color=[0, 5000],
height = 800,
width = 600)
figRE4.update_xaxes(showticklabels=False)
figRE4.update_yaxes(showticklabels=False)
figRE4.show()
processing month 5 processing month 6 processing month 7 processing month 8 processing month 9 processing month 10 the shape of rededge_arr is (6, 437, 189) {'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 5490, 'height': 5490, 'count': 1, 'crs': CRS.from_epsg(32641), 'transform': Affine(20.0, 0.0, 699960.0, 0.0, -20.0, 4400040.0)} {'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 189, 'height': 437, 'count': 6, 'crs': CRS.from_epsg(32641), 'transform': Affine(20.0, 0.0, 696976.8515412887, 0.0, -20.0, 4298978.419482002)}
# Transform Pol1 NDVI and Red-edge into a tabular format
# Compute zonal statistics that summarize the NDVI values for Pol1
zstats_meta = out_meta1
zstats_meta["count"] = 1
# Create a copy of pol1_gdf to save zonal_stats results to
pol1_gdf_ndvi = pol1_gdf.copy()
# Create a For-loop that computes NDVI zonal statistics for each month
month_idx = 5
for i in ndvi_arr1:
print(f"computing ndvi zonal stats for month {month_idx}")
zstats = zonal_stats(
pol1_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol1_gdf_ndvi[f"ndvi_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Compute zonal statistics that summarize the Red-edge values for Pol1
zstats_meta = out_meta_rededge1
zstats_meta["count"] = 1
# Create a copy of pol1_gdf to save zonal_stats results to
pol1_gdf_rededge = pol1_gdf.copy()
# Create a For-loop that computes Red-edge zonal statistics for each month
month_idx = 5
for i in rededge_arr1:
print(f"computing rededge zonal stats for month {month_idx}")
zstats = zonal_stats(
pol1_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol1_gdf_rededge[f"rededge_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Combine the computed zonal statistics of NDVI and Rededge into one dataframe
pol1_NDVI_RE_df = pd.merge(
    left=pol1_gdf_ndvi, right=pol1_gdf_rededge, how="inner",
    on=["country", "region", "year", "season", "crop_1", "aoi", "crop1_code", "geometry"])
# Display the first few rows
pol1_NDVI_RE_df.head()
computing ndvi zonal stats for month 5 computing ndvi zonal stats for month 6 computing ndvi zonal stats for month 7
/opt/conda/lib/python3.10/site-packages/rasterstats/io.py:328: NodataWarning: Setting nodata to -999; specify nodata explicitly
computing ndvi zonal stats for month 8 computing ndvi zonal stats for month 9 computing ndvi zonal stats for month 10 computing rededge zonal stats for month 5 computing rededge zonal stats for month 6 computing rededge zonal stats for month 7 computing rededge zonal stats for month 8 computing rededge zonal stats for month 9 computing rededge zonal stats for month 10
country | region | year | season | crop_1 | aoi | crop1_code | geometry | ndvi_5 | ndvi_6 | ndvi_7 | ndvi_8 | ndvi_9 | ndvi_10 | rededge_5 | rededge_6 | rededge_7 | rededge_8 | rededge_9 | rededge_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Uzbekistan | Fergana | 2018 | summer | cotton | aoi_1 | 0 | MULTIPOLYGON (((71.18975 40.59049, 71.18977 40... | 0.106648 | 0.058165 | 0.487728 | 0.649186 | 0.379399 | 0.350379 | 2111.467836 | 2017.707602 | 3120.929825 | 3351.959064 | 2173.830409 | 2132.140351 |
1 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19309 40.58909, 71.19210 40... | 0.480385 | 0.464282 | 0.248883 | 0.517827 | 0.623533 | 0.554184 | 2596.467532 | 2192.253247 | 2360.298701 | 2547.110390 | 2883.032468 | 2709.240260 |
2 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19293 40.58684, 71.19218 40... | 0.562466 | 0.474759 | 0.096767 | 0.148958 | 0.376035 | 0.334232 | 2812.306122 | 2162.571429 | 2657.346939 | 2019.204082 | 2420.959184 | 2359.734694 |
3 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19390 40.58708, 71.19298 40... | 0.555527 | 0.467947 | 0.159104 | 0.327111 | 0.686653 | 0.565116 | 2813.687500 | 2137.265625 | 2037.906250 | 2397.437500 | 3153.304688 | 2953.085938 |
4 | Uzbekistan | Fergana | 2018 | summer | cotton | aoi_1 | 0 | MULTIPOLYGON (((71.17040 40.55147, 71.16873 40... | 0.113374 | 0.093075 | 0.718671 | 0.765948 | 0.417530 | 0.353729 | 2204.968421 | 2184.100000 | 4534.073684 | 4058.252632 | 2242.368421 | 2256.947368 |
# Transform Pol2 NDVI and Red-edge into a tabular format
# Compute zonal statistics that summarize the NDVI values for Pol2
zstats_meta = out_meta2
zstats_meta["count"] = 1
# Create a copy of pol2_gdf to save zonal_stats results to
pol2_gdf_ndvi = pol2_gdf.copy()
# Create a For-loop that computes NDVI zonal statistics for each month
month_idx = 5
for i in ndvi_arr2:
print(f"computing ndvi zonal stats for month {month_idx}")
zstats = zonal_stats(
pol2_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol2_gdf_ndvi[f"ndvi_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Compute zonal statistics that summarize the Red-edge values for Pol2
zstats_meta = out_meta_rededge2
zstats_meta["count"] = 1
# Create a copy of pol2_gdf to save zonal_stats results to
pol2_gdf_rededge = pol2_gdf.copy()
# Create a For-loop that computes Red-edge zonal statistics for each month
month_idx = 5
for i in rededge_arr2:
print(f"computing rededge zonal stats for month {month_idx}")
zstats = zonal_stats(
pol2_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol2_gdf_rededge[f"rededge_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Combine the computed zonal statistics of NDVI and Rededge into one dataframe
pol2_NDVI_RE_df = pd.merge(
    left=pol2_gdf_ndvi, right=pol2_gdf_rededge, how="inner",
    on=["country", "region", "year", "season", "crop_1", "aoi", "crop1_code", "geometry"])
# Display the first few rows
pol2_NDVI_RE_df.head()
computing ndvi zonal stats for month 5 computing ndvi zonal stats for month 6 computing ndvi zonal stats for month 7 computing ndvi zonal stats for month 8 computing ndvi zonal stats for month 9 computing ndvi zonal stats for month 10 computing rededge zonal stats for month 5 computing rededge zonal stats for month 6 computing rededge zonal stats for month 7 computing rededge zonal stats for month 8 computing rededge zonal stats for month 9 computing rededge zonal stats for month 10
country | region | year | season | crop_1 | aoi | crop1_code | geometry | ndvi_5 | ndvi_6 | ndvi_7 | ndvi_8 | ndvi_9 | ndvi_10 | rededge_5 | rededge_6 | rededge_7 | rededge_8 | rededge_9 | rededge_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Uzbekistan | Samarkand | 2016 | permanent | vineyard | aoi_2 | 6 | MULTIPOLYGON (((66.98073 39.10310, 66.98211 39... | 0.616800 | 0.616145 | 0.590545 | 0.648879 | 0.539297 | 0.474883 | 4109.843750 | 4027.875000 | 3997.031250 | 3880.135417 | 3364.979167 | 3095.583333 |
1 | Uzbekistan | Samarkand | 2016 | unclear | vegetables | aoi_2 | 7 | MULTIPOLYGON (((66.97909 39.10714, 66.97979 39... | 0.748144 | 0.541868 | 0.463725 | 0.557448 | 0.545439 | 0.434635 | 4669.937500 | 4002.375000 | 3571.812500 | 4083.312500 | 3447.812500 | 3212.187500 |
2 | Uzbekistan | Samarkand | 2016 | permanent | vineyard | aoi_2 | 6 | MULTIPOLYGON (((66.97806 39.10726, 66.97809 39... | 0.543793 | 0.591070 | 0.531800 | 0.591057 | 0.555157 | 0.434555 | 3846.788462 | 3834.692308 | 3762.076923 | 3600.009615 | 3303.230769 | 2993.182692 |
3 | Uzbekistan | Samarkand | 2016 | summer | maize | aoi_2 | 5 | MULTIPOLYGON (((66.98070 39.10780, 66.98098 39... | 0.791644 | 0.348566 | 0.283063 | 0.206600 | 0.195828 | 0.156752 | 3883.125000 | 3146.000000 | 3264.750000 | 3496.875000 | 2841.750000 | 2042.500000 |
4 | Uzbekistan | Samarkand | 2016 | winter | wheat | aoi_2 | 1 | MULTIPOLYGON (((66.97461 39.10863, 66.97308 39... | 0.675362 | 0.365412 | 0.420331 | 0.487360 | 0.336148 | 0.260112 | 4174.475000 | 3680.950000 | 3729.675000 | 3785.725000 | 3306.275000 | 3060.975000 |
# Transform Pol3 NDVI and Red-edge into a tabular format
# Compute zonal statistics that summarize the NDVI values for Pol3
zstats_meta = out_meta3
zstats_meta["count"] = 1
# Create a copy of pol3_gdf to save zonal_stats results to
pol3_gdf_ndvi = pol3_gdf.copy()
# Create a For-loop that computes NDVI zonal statistics for each month
month_idx = 5
for i in ndvi_arr3:
print(f"computing ndvi zonal stats for month {month_idx}")
zstats = zonal_stats(
pol3_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol3_gdf_ndvi[f"ndvi_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Compute zonal statistics that summarize the Red-edge values for Pol3
zstats_meta = out_meta_rededge3
zstats_meta["count"] = 1
# Create a copy of pol3_gdf to save zonal_stats results to
pol3_gdf_rededge = pol3_gdf.copy()
# Create a For-loop that computes Red-edge zonal statistics for each month
month_idx = 5
for i in rededge_arr3:
print(f"computing rededge zonal stats for month {month_idx}")
zstats = zonal_stats(
pol3_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol3_gdf_rededge[f"rededge_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Combine the computed zonal statistics of NDVI and Rededge into one dataframe
pol3_NDVI_RE_df = pd.merge(
    left=pol3_gdf_ndvi, right=pol3_gdf_rededge, how="inner",
    on=["country", "region", "year", "season", "crop_1", "aoi", "crop1_code", "geometry"])
# Display the first few rows
pol3_NDVI_RE_df.head()
computing ndvi zonal stats for month 5 computing ndvi zonal stats for month 6 computing ndvi zonal stats for month 7 computing ndvi zonal stats for month 8 computing ndvi zonal stats for month 9 computing ndvi zonal stats for month 10 computing rededge zonal stats for month 5 computing rededge zonal stats for month 6 computing rededge zonal stats for month 7 computing rededge zonal stats for month 8 computing rededge zonal stats for month 9 computing rededge zonal stats for month 10
country | region | year | season | crop_1 | aoi | crop1_code | geometry | ndvi_5 | ndvi_6 | ndvi_7 | ndvi_8 | ndvi_9 | ndvi_10 | rededge_5 | rededge_6 | rededge_7 | rededge_8 | rededge_9 | rededge_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.19614 39.19174, 65.19652 39.18742... | 0.651894 | 0.207635 | 0.195904 | 0.182068 | 0.199424 | 0.124098 | 3142.551669 | 4190.310016 | 4259.621622 | 4501.093800 | 4347.235294 | 2561.445151 |
1 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.19085 39.19327, 65.18710 39.19826... | 0.108015 | 0.138196 | 0.205911 | 0.483191 | 0.478064 | 0.228358 | 3197.067308 | 3102.408654 | 3142.394231 | 3549.528846 | 3387.072115 | 2796.418269 |
2 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.19075 39.18957, 65.19072 39.18759... | 0.186981 | 0.172990 | 0.211738 | 0.490073 | 0.518232 | 0.251647 | 3034.577626 | 3196.744292 | 3603.045662 | 3826.844749 | 3703.933790 | 3273.424658 |
3 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.18067 39.18739, 65.18743 39.18734... | 0.174689 | 0.209438 | 0.244037 | 0.476990 | 0.510088 | 0.309258 | 3365.637555 | 3293.724891 | 3374.043668 | 3589.860262 | 3298.663755 | 2919.606987 |
4 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.18304 39.18958, 65.17674 39.18661... | 0.483318 | 0.245942 | 0.170634 | 0.122645 | 0.135018 | 0.094399 | 3451.836483 | 4281.904684 | 4405.020542 | 4188.260477 | 3437.099425 | 2390.975349 |
# Transform Pol4 NDVI and Red-edge into a tabular format
# Compute zonal statistics that summarize the NDVI values for Pol4
zstats_meta = out_meta4
zstats_meta["count"] = 1
# Create a copy of pol4_gdf to save zonal_stats results to
pol4_gdf_ndvi = pol4_gdf.copy()
# Create a For-loop that computes NDVI zonal statistics for each month
month_idx = 5
for i in ndvi_arr4:
print(f"computing ndvi zonal stats for month {month_idx}")
zstats = zonal_stats(
pol4_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol4_gdf_ndvi[f"ndvi_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Compute zonal statistics that summarize the Red-edge values for Pol4
zstats_meta = out_meta_rededge4
zstats_meta["count"] = 1
# Create a copy of pol4_gdf to save zonal_stats results to
pol4_gdf_rededge = pol4_gdf.copy()
# Create a For-loop that computes Red-edge zonal statistics for each month
month_idx = 5
for i in rededge_arr4:
print(f"computing rededge zonal stats for month {month_idx}")
zstats = zonal_stats(
pol4_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol4_gdf_rededge[f"rededge_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Combine the computed zonal statistics of NDVI and Rededge into one dataframe
pol4_NDVI_RE_df = pd.merge(
    left=pol4_gdf_ndvi, right=pol4_gdf_rededge, how="inner",
    on=["country", "region", "year", "season", "crop_1", "aoi", "crop1_code", "geometry"])
# Display the first few rows
pol4_NDVI_RE_df.head()
computing ndvi zonal stats for month 5 computing ndvi zonal stats for month 6 computing ndvi zonal stats for month 7 computing ndvi zonal stats for month 8 computing ndvi zonal stats for month 9 computing ndvi zonal stats for month 10 computing rededge zonal stats for month 5 computing rededge zonal stats for month 6 computing rededge zonal stats for month 7 computing rededge zonal stats for month 8 computing rededge zonal stats for month 9 computing rededge zonal stats for month 10
country | region | year | season | crop_1 | aoi | crop1_code | geometry | ndvi_5 | ndvi_6 | ndvi_7 | ndvi_8 | ndvi_9 | ndvi_10 | rededge_5 | rededge_6 | rededge_7 | rededge_8 | rededge_9 | rededge_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Uzbekistan | Kashkadarya | 2018 | winter | wheat | aoi_4 | 1 | POLYGON ((65.30076 38.79538, 65.30125 38.79608... | 0.110825 | 0.121411 | 0.299068 | 0.585074 | 0.741629 | 0.549816 | 2900.876404 | 2901.741573 | 3543.550562 | 4159.280899 | 4146.483146 | 2850.898876 |
1 | Uzbekistan | Kashkadarya | 2018 | summer | cotton | aoi_4 | 0 | POLYGON ((65.29607 38.79463, 65.29717 38.79513... | 0.521030 | 0.345005 | 0.253342 | 0.240403 | 0.235322 | 0.190898 | 2970.420455 | 3903.886364 | 3709.397727 | 3917.454545 | 3781.590909 | 3302.147727 |
2 | Uzbekistan | Kashkadarya | 2018 | winter | wheat | aoi_4 | 1 | POLYGON ((65.28487 38.80078, 65.29384 38.80442... | 0.301086 | 0.130437 | 0.239223 | 0.386766 | 0.454388 | 0.309634 | 3078.240000 | 3209.526667 | 3597.360000 | 3783.020000 | 3730.333333 | 2761.180000 |
3 | Uzbekistan | Kashkadarya | 2018 | permanent | orchard | aoi_4 | 2 | POLYGON ((65.29744 38.79249, 65.29819 38.79281... | 0.339116 | 0.255328 | 0.133188 | 0.177755 | 0.448556 | 0.323423 | 3031.396552 | 3594.017241 | 2920.448276 | 3531.224138 | 3543.741379 | 2932.327586 |
4 | Uzbekistan | Kashkadarya | 2018 | permanent | orchard | aoi_4 | 2 | POLYGON ((65.29721 38.79514, 65.29837 38.79563... | 0.341406 | 0.263521 | 0.075036 | 0.142118 | 0.516263 | 0.369137 | 3088.340426 | 3635.436170 | 2219.872340 | 3255.202128 | 3427.191489 | 2755.648936 |
# Combine NDVI and Red-edge data from all polygons into one dataframe
# Create a list storing the dataframe for each polygon
gdf_list = [pol1_NDVI_RE_df, pol2_NDVI_RE_df,pol3_NDVI_RE_df,pol4_NDVI_RE_df]
# Concatenate each dataframe on to each other and store in AllPol dataframe
AllPol_gdf = pd.concat(gdf_list)
# Check the type of AllPol_gdf
print(type(AllPol_gdf))
# Check the shape of AllPol_gdf
print(AllPol_gdf.shape)
# Print the first few columns of AllPol_gdf
AllPol_gdf.head()
<class 'geopandas.geodataframe.GeoDataFrame'> (445, 20)
country | region | year | season | crop_1 | aoi | crop1_code | geometry | ndvi_5 | ndvi_6 | ndvi_7 | ndvi_8 | ndvi_9 | ndvi_10 | rededge_5 | rededge_6 | rededge_7 | rededge_8 | rededge_9 | rededge_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Uzbekistan | Fergana | 2018 | summer | cotton | aoi_1 | 0 | MULTIPOLYGON (((71.18975 40.59049, 71.18977 40... | 0.106648 | 0.058165 | 0.487728 | 0.649186 | 0.379399 | 0.350379 | 2111.467836 | 2017.707602 | 3120.929825 | 3351.959064 | 2173.830409 | 2132.140351 |
1 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19309 40.58909, 71.19210 40... | 0.480385 | 0.464282 | 0.248883 | 0.517827 | 0.623533 | 0.554184 | 2596.467532 | 2192.253247 | 2360.298701 | 2547.110390 | 2883.032468 | 2709.240260 |
2 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19293 40.58684, 71.19218 40... | 0.562466 | 0.474759 | 0.096767 | 0.148958 | 0.376035 | 0.334232 | 2812.306122 | 2162.571429 | 2657.346939 | 2019.204082 | 2420.959184 | 2359.734694 |
3 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19390 40.58708, 71.19298 40... | 0.555527 | 0.467947 | 0.159104 | 0.327111 | 0.686653 | 0.565116 | 2813.687500 | 2137.265625 | 2037.906250 | 2397.437500 | 3153.304688 | 2953.085938 |
4 | Uzbekistan | Fergana | 2018 | summer | cotton | aoi_1 | 0 | MULTIPOLYGON (((71.17040 40.55147, 71.16873 40... | 0.113374 | 0.093075 | 0.718671 | 0.765948 | 0.417530 | 0.353729 | 2204.968421 | 2184.100000 | 4534.073684 | 4058.252632 | 2242.368421 | 2256.947368 |
# Check how many rows with NaN values we have
nan_counts = AllPol_gdf.isnull().sum()
print(nan_counts)
# We have 33 rows with NaN
country 0 region 0 year 0 season 0 crop_1 0 aoi 0 crop1_code 0 geometry 0 ndvi_5 33 ndvi_6 33 ndvi_7 33 ndvi_8 33 ndvi_9 33 ndvi_10 33 rededge_5 0 rededge_6 33 rededge_7 33 rededge_8 33 rededge_9 33 rededge_10 33 dtype: int64
# Remove all rows with NaN
print("Shape of DataFrame before dropping zero yield values:")
print(AllPol_gdf.shape)
AllPol_gdf_dropped_NaN = AllPol_gdf.dropna()
print("Shape of DataFrame after dropping zero yield values:")
print(AllPol_gdf_dropped_NaN.shape)
# Number of rows decreased by 33, confirming we have removed all the rows with NaN values
Shape of DataFrame before dropping rows with NaN values: (445, 20) Shape of DataFrame after dropping rows with NaN values: (412, 20)
# Check how many rows with outliers we have
# Create a list of predictors to pass into the For-loop
predictors = ["ndvi_5","ndvi_6","ndvi_7","ndvi_8","ndvi_9","ndvi_10","rededge_5","rededge_6","rededge_7","rededge_8","rededge_9","rededge_10"]
# Generate a For-loop that counts the number of rows that have outliers for each predictor
for i in predictors:
print(f"The number of rows with outliers in {i}")
summary = AllPol_gdf_dropped_NaN[[i]].describe()
mean = summary.loc['mean']
std = summary.loc['std']
threshold = 3 * std
# Define outliers
outliers = (AllPol_gdf_dropped_NaN[[i]] > (mean + threshold)) | (AllPol_gdf_dropped_NaN[[i]] < (mean - threshold))
# Compute the total number of rows with outliers
rows_with_outliers = outliers.any(axis=1).sum()
print(rows_with_outliers)
The number of rows with outliers in ndvi_5 0 The number of rows with outliers in ndvi_6 2 The number of rows with outliers in ndvi_7 1 The number of rows with outliers in ndvi_8 0 The number of rows with outliers in ndvi_9 0 The number of rows with outliers in ndvi_10 1 The number of rows with outliers in rededge_5 12 The number of rows with outliers in rededge_6 0 The number of rows with outliers in rededge_7 2 The number of rows with outliers in rededge_8 2 The number of rows with outliers in rededge_9 1 The number of rows with outliers in rededge_10 2
# Remove outliers column by column for each predictor
# Create a list of predictors to pass into the For-loop
predictors = ["ndvi_5","ndvi_6","ndvi_7","ndvi_8","ndvi_9","ndvi_10","rededge_5","rededge_6","rededge_7","rededge_8","rededge_9","rededge_10"]
# Calculate the mean of each column
for i in predictors:
print(AllPol_gdf_dropped_NaN[i].mean())
# Store mean values in a list
mean_columns = [0.37863035405212636,
0.29362544445278704,
0.30908943074915174,
0.3761947765611446,
0.39587637956960864,
0.31138927726383514,
4687.018390525821,
3417.233096856336,
3552.2705976999932,
3578.5544779844145,
3407.8627182745977,
2903.206591342297]
# Calculate the standard deviation of each column
for i in predictors:
print(AllPol_gdf_dropped_NaN[i].std())
#Store standard deviation values in a list
sd_columns = [0.20280309247659548,
0.1680853065268984,
0.1559785137138195,
0.17190269643184122,
0.16819120363708812,
0.1533305540472219,
8063.2830106455,
678.5587102694418,
548.1344136832438,
502.9737064976784,
507.99505789788094,
420.736125187469]
0.37863035405212636 0.29362544445278704 0.30908943074915174 0.3761947765611446 0.39587637956960864 0.31138927726383514 4687.018390525821 3417.233096856336 3552.2705976999932 3578.5544779844145 3407.8627182745977 2903.206591342297 0.2028030924765954 0.16808530652689838 0.15597851371381952 0.1719026964318412 0.168191203637088 0.15333055404722185 8063.283010645502 678.5587102694416 548.1344136832438 502.97370649767817 507.99505789788066 420.73612518746904
# Set index to 0 to track the position in the mean and standard deviation lists throughout the loop
idx = 0
def remove_outliers(df, k, threshold=3):
    # Keep only rows whose value in column k lies within `threshold` standard deviations of the column mean
    mean = mean_columns[idx]
    std = sd_columns[idx]
    lower_bound = mean - (threshold * std)
    upper_bound = mean + (threshold * std)
    return df[(df[k] >= lower_bound) & (df[k] <= upper_bound)]
filtered_gdf = AllPol_gdf_dropped_NaN
for k in predictors:
# Generate a For-loop that removes all the rows with outliers in each predictor column
filtered_gdf = remove_outliers(filtered_gdf, k)
filtered_gdf.reset_index(drop=True, inplace=True)
print(filtered_gdf.shape)
# Increment index by 1 to move to the next mean and standard deviation values in the lists
idx = idx + 1
(412, 20) (410, 20) (409, 20) (409, 20) (409, 20) (408, 20) (396, 20) (396, 20) (395, 20) (394, 20) (393, 20) (393, 20)
We counted the total number of rows with NaN values using a combination of isnull() and sum(), so that we could check whether the correct number of rows was removed. Rows with NaN values were then removed using the dropna() function.
In this model, outliers are defined as any value outside the range of three standard deviations from the mean. We counted the rows with outliers for each column so that we could check whether the expected number of rows was removed. However, one row can have outliers in multiple columns; as a row is removed, the next column may have one fewer row with outliers left to remove. This means the counts we use as a check may not exactly match the number of rows actually removed for each column, but they do give an approximate expectation. For example, the red-edge 5 column has 12 outliers, and the number of rows dropped from 408 to 396 when its outliers were removed, indicating that we successfully removed them.
Outliers were removed column by column using the manually defined remove_outliers() function, which drops any value below the lower bound (3 standard deviations below the mean) or above the upper bound (3 standard deviations above the mean).
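The same three-standard-deviation rule can also be written without the hard-coded lists by computing the column statistics up front with pandas. The following is a minimal sketch of that equivalent filter (assuming the AllPol_gdf_dropped_NaN dataframe and the predictors list defined above); it is an illustration, not the code used for the results reported here.
# Sketch: equivalent column-by-column outlier filter with the statistics computed up front
stats = AllPol_gdf_dropped_NaN[predictors].agg(["mean", "std"])
filtered_check = AllPol_gdf_dropped_NaN.copy()
for col in predictors:
    lower = stats.loc["mean", col] - 3 * stats.loc["std", col]
    upper = stats.loc["mean", col] + 3 * stats.loc["std", col]
    # Keep only rows within 3 standard deviations of the column mean
    filtered_check = filtered_check[(filtered_check[col] >= lower) & (filtered_check[col] <= upper)]
print(filtered_check.shape)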
# Create a bar chart to count the number of observations we have for each crop type
Crop_count = px.histogram(filtered_gdf, x="crop_1")
Crop_count.update_yaxes(title_text="Number of observations")
Crop_count.update_xaxes(title_text="Crop type")
Crop_count.show()
print(filtered_gdf.shape)
# From the bar chart, we found that the number of observations for rice, beans, maize, vegetables and fallow is insufficient to train a proper classification model; therefore, we drop these crop types
condition_rice = filtered_gdf["crop_1"] == "rice"
condition_beans = filtered_gdf["crop_1"] == "beans"
condition_fallow= filtered_gdf["crop_1"] == "fallow"
condition_maize= filtered_gdf["crop_1"] == "maize"
condition_vegetables= filtered_gdf["crop_1"] == "vegetables"
filtered_gdf = filtered_gdf.drop(filtered_gdf[condition_rice].index)
filtered_gdf = filtered_gdf.drop(filtered_gdf[condition_beans].index)
filtered_gdf = filtered_gdf.drop(filtered_gdf[condition_fallow].index)
filtered_gdf = filtered_gdf.drop(filtered_gdf[condition_maize].index)
filtered_gdf = filtered_gdf.drop(filtered_gdf[condition_vegetables].index)
# Print the shape to check that 24 rows representing rice, beans, maize, vegetables, and fallow were dropped
print(filtered_gdf.shape)
(393, 20) (369, 20)
/opt/conda/lib/python3.10/site-packages/geopandas/geodataframe.py:1428: UserWarning: Boolean Series key will be reindexed to match DataFrame index. /opt/conda/lib/python3.10/site-packages/geopandas/geodataframe.py:1428: UserWarning: Boolean Series key will be reindexed to match DataFrame index. /opt/conda/lib/python3.10/site-packages/geopandas/geodataframe.py:1428: UserWarning: Boolean Series key will be reindexed to match DataFrame index. /opt/conda/lib/python3.10/site-packages/geopandas/geodataframe.py:1428: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
# Create a new dataframe that summarizes the average values of NDVI and Red-edge for each crop type
filtered_gdf_groups = filtered_gdf.groupby(["crop_1"])
filtered_gdf_groups.size()
# Pass numeric_only=True so only the numeric NDVI/Red-edge columns are averaged
crop_averages = filtered_gdf_groups.mean(numeric_only=True)
crop_averages
crop1_code | ndvi_5 | ndvi_6 | ndvi_7 | ndvi_8 | ndvi_9 | ndvi_10 | rededge_5 | rededge_6 | rededge_7 | rededge_8 | rededge_9 | rededge_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
crop_1 | |||||||||||||
alfalfa | 8.0 | 0.569191 | 0.421874 | 0.403822 | 0.399101 | 0.435692 | 0.379882 | 3801.269029 | 3799.404979 | 3856.133407 | 3760.713435 | 3645.027031 | 3265.225652 |
cotton | 0.0 | 0.259064 | 0.189474 | 0.299315 | 0.382988 | 0.356697 | 0.264313 | 3132.246634 | 3267.360390 | 3600.293509 | 3592.284702 | 3319.144499 | 2790.055704 |
orchard | 2.0 | 0.510183 | 0.411830 | 0.414758 | 0.469672 | 0.524347 | 0.416552 | 3620.155814 | 3626.364470 | 3550.457153 | 3658.882003 | 3564.856285 | 2988.295056 |
vineyard | 6.0 | 0.527496 | 0.495214 | 0.470471 | 0.488136 | 0.468839 | 0.421426 | 3862.713535 | 3828.018298 | 3792.799446 | 3699.929657 | 3425.881815 | 3063.786412 |
wheat | 1.0 | 0.381277 | 0.297124 | 0.250301 | 0.334811 | 0.402461 | 0.308856 | 3339.121626 | 3370.310844 | 3406.661813 | 3490.657762 | 3439.219007 | 2903.148407 |
# Transpose the dataframe to enable visualization in line graphs
crop_averages_Transposed= crop_averages.T
crop_averages_T_NDVI = crop_averages_Transposed.copy()
# Drop the red-edge rows to visualize NDVI data
crop_averages_NDVI_ready = crop_averages_T_NDVI.drop(["crop1_code","rededge_5","rededge_6","rededge_7","rededge_8","rededge_9","rededge_10"],axis=0)
crop_averages_NDVI_ready
crop_1 | alfalfa | cotton | orchard | vineyard | wheat |
---|---|---|---|---|---|
ndvi_5 | 0.569191 | 0.259064 | 0.510183 | 0.527496 | 0.381277 |
ndvi_6 | 0.421874 | 0.189474 | 0.411830 | 0.495214 | 0.297124 |
ndvi_7 | 0.403822 | 0.299315 | 0.414758 | 0.470471 | 0.250301 |
ndvi_8 | 0.399101 | 0.382988 | 0.469672 | 0.488136 | 0.334811 |
ndvi_9 | 0.435692 | 0.356697 | 0.524347 | 0.468839 | 0.402461 |
ndvi_10 | 0.379882 | 0.264313 | 0.416552 | 0.421426 | 0.308856 |
# Visualize the average NDVI data over time for each crop
# Create a list of crop names to pass into the For-loop
crop = ["alfalfa","cotton", "orchard","vineyard", "wheat"]
# Generate a For-loop that creates a line graph for each crop type displaying average NDVI data over time
for crop_name in crop:
    fig_ndvi = px.line(
        data_frame=crop_averages_NDVI_ready,
        x=["May", "June", "July", "August", "September", "October"],
        y=crop_name,
        title=f"NDVI of <b>{crop_name}</b> crop from May to October, 2018 in Central Asia")
    fig_ndvi.update_traces(line_color='#009b00', line_width=5)
    fig_ndvi.update_yaxes(title_text="NDVI")
    fig_ndvi.update_xaxes(title_text="Month")
    fig_ndvi.show()
# Transpose the dataframe to allow visualization in line graphs
crop_averages_Transposed= crop_averages.T
crop_averages_T_Rededge = crop_averages_Transposed.copy()
# Drop the NDVI rows to visualize Red-edge data
crop_averages_Rededge_ready = crop_averages_T_Rededge.drop(["crop1_code","ndvi_5","ndvi_6","ndvi_7","ndvi_8","ndvi_9","ndvi_10"],axis=0)
crop_averages_Rededge_ready
crop_1 | alfalfa | cotton | orchard | vineyard | wheat |
---|---|---|---|---|---|
rededge_5 | 3801.269029 | 3132.246634 | 3620.155814 | 3862.713535 | 3339.121626 |
rededge_6 | 3799.404979 | 3267.360390 | 3626.364470 | 3828.018298 | 3370.310844 |
rededge_7 | 3856.133407 | 3600.293509 | 3550.457153 | 3792.799446 | 3406.661813 |
rededge_8 | 3760.713435 | 3592.284702 | 3658.882003 | 3699.929657 | 3490.657762 |
rededge_9 | 3645.027031 | 3319.144499 | 3564.856285 | 3425.881815 | 3439.219007 |
rededge_10 | 3265.225652 | 2790.055704 | 2988.295056 | 3063.786412 | 2903.148407 |
# Visualize the average Red-edge data over time for each crop
# Create a list of crop names to pass into the For-loop
crop = ["alfalfa","cotton", "orchard","vineyard", "wheat"]
# Generate a For-loop that creates a line graph for each crop type displaying average Red-edge data over time
for crop_name in crop:
    fig_rededge = px.line(
        data_frame=crop_averages_Rededge_ready,
        x=["May", "June", "July", "August", "September", "October"],
        y=crop_name,
        title=f"Rededge of <b>{crop_name}</b> crop from May to October, 2018 in Central Asia")
    fig_rededge.update_traces(line_color='#FF0000', line_width=5)
    fig_rededge.update_yaxes(title_text="Rededge")
    fig_rededge.update_xaxes(title_text="Month")
    fig_rededge.show()
# Pool vineyard, orchard and alfalfa together by changing their crop type label to "others"
filtered_gdf["crop_1"] = filtered_gdf["crop_1"].replace("vineyard", 'others')
filtered_gdf["crop_1"] = filtered_gdf["crop_1"].replace("orchard", 'others')
filtered_gdf["crop_1"] = filtered_gdf["crop_1"].replace("alfalfa", 'others')
# check the new crop types. There should be 3 crop types: wheat, cotton, others
filtered_gdf["crop_1"].value_counts()
wheat 165 cotton 134 others 70 Name: crop_1, dtype: int64
We created a bar chart to illustrate the number of observations for each crop type. This allowed us to identify and remove the crop types (rice, fallow, maize, vegetables and beans) that have too few observations for the model to learn their patterns.
We visualized the NDVI and red-edge data for each crop over the chosen 6-month period using line graphs, which are well suited to continuous time-series data. Because the machine-learning model relies on the NDVI/red-edge profiles differing between crop types, it is useful to know how similar or distinct each crop type is. For example, if the model mistakes wheat for cotton, the graphs may show that the NDVI/red-edge profiles of the two crops are similar, explaining the confusion. The NDVI and red-edge data are displayed in green and red, respectively, to distinguish the two sets of graphs, and the crop name is in bold so viewers can quickly identify the crop each graph represents.
As a rough rule of thumb, around 50 observations per class gives a machine-learning model a reasonable chance of learning the structure of that class. We therefore grouped vineyard, orchard and alfalfa together as "others", which adds up to 70 samples. The line graphs show that all three crops have similar NDVI and red-edge values, so pooling them is unlikely to mix conflicting patterns. Pooling the minority classes also alleviates the class-imbalance issue.
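Pooling reduces but does not remove the imbalance between wheat, cotton and "others". One complementary option, shown below purely as a sketch (it is not used in the model trained later), is to let the random forest reweight classes inversely to their frequency through the class_weight parameter.
# Sketch only: a class-weighted random forest as one way to counter residual class imbalance
rf_weighted = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",  # weight classes inversely to their frequency in the training data
    random_state=6)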
# Explore our NDVI and Red-edge data for each field polygon
filtered_gdf.explore("crop_1", tiles="CartoDB dark_matter", cmap="tab20", categorical=True)
# Check that there are enough observations in each polygon (area of interest)
filtered_gdf.groupby("aoi").count().loc[:,"crop1_code"]
aoi aoi_1 68 aoi_2 83 aoi_3 119 aoi_4 99 Name: crop1_code, dtype: int64
# Subset the geodataframe to create our x object, which should follow the format (samples, predictors)
x_sp = filtered_gdf.drop(["country", "region", "year", "season", "crop_1", "aoi", "crop1_code","geometry"], axis=1)
# Subset the geodataframe to create our y object, which should have only the ground-truth outcomes
y_sp = filtered_gdf.loc[:, "crop_1"]
# check the structure and shape of x
print(x_sp.head())
print(x_sp.shape)
# check the structure and shape of y
print(y_sp.head())
print(y_sp.shape)
ndvi_5 ndvi_6 ndvi_7 ndvi_8 ndvi_9 ndvi_10 rededge_5 \ 0 0.106648 0.058165 0.487728 0.649186 0.379399 0.350379 2111.467836 1 0.480385 0.464282 0.248883 0.517827 0.623533 0.554184 2596.467532 2 0.555527 0.467947 0.159104 0.327111 0.686653 0.565116 2813.687500 3 0.113374 0.093075 0.718671 0.765948 0.417530 0.353729 2204.968421 4 0.142455 0.103555 0.623383 0.732566 0.401693 0.330844 2286.989362 rededge_6 rededge_7 rededge_8 rededge_9 rededge_10 0 2017.707602 3120.929825 3351.959064 2173.830409 2132.140351 1 2192.253247 2360.298701 2547.110390 2883.032468 2709.240260 2 2137.265625 2037.906250 2397.437500 3153.304688 2953.085938 3 2184.100000 4534.073684 4058.252632 2242.368421 2256.947368 4 2224.404255 4218.494681 3734.882979 2268.154255 2295.696809 (369, 12) 0 cotton 1 wheat 2 wheat 3 cotton 4 cotton Name: crop_1, dtype: object (369,)
# To prevent data leakage, we ensure that the training and test sets do not contain samples from the same area.
groups = filtered_gdf.loc[:, "aoi"]
# Use GroupShuffleSplit function to obtain 4 array-like objects: x_train_sp, x_test_sp, y_train_sp, y_test_sp
gss = GroupShuffleSplit(n_splits=1, test_size=.5, random_state=6)
for i, (train_index, test_index) in enumerate(gss.split(x_sp, y_sp, groups)):
print(f"processing split {i}")
x_train_sp = x_sp.iloc[train_index, :]
x_test_sp = x_sp.iloc[test_index, :]
y_train_sp = y_sp.iloc[train_index]
y_test_sp = y_sp.iloc[test_index]
# Inspect the shape of each object
print(f"the size of the training features object is {x_train_sp.shape}")
print(f"the size of the test features object is {x_test_sp.shape}")
print(f"the size of the training outcomes object is {y_train_sp.shape}")
print(f"the size of the test outcomes object is {y_test_sp.shape}")
processing split 0 the size of the training features object is (202, 12) the size of the test features object is (167, 12) the size of the training outcomes object is (202,) the size of the test outcomes object is (167,)
# Check that both the training and test sets have all three types of crops
print("The training set has the following crop types:")
print(y_train_sp.value_counts())
print("")
print("The test set has the following crop types:")
print(y_test_sp.value_counts())
The training set has the following crop types: wheat 78 cotton 65 others 59 Name: crop_1, dtype: int64 The test set has the following crop types: wheat 87 cotton 69 others 11 Name: crop_1, dtype: int64
# Create and train a random forests model
rf_sp = RandomForestClassifier(n_estimators=200, random_state=6)
rf_sp.fit(x_train_sp, y_train_sp.astype(str))
RandomForestClassifier(n_estimators=200, random_state=6)
# Using the random forest model that we created using the training set to predict the x_test_sp object
y_pred_sp = rf_sp.predict(x_test_sp)
# Test and evaluate the model using a classification report, which compares the predictions made for x_test_sp with the ground-truth labels in y_test_sp
print(classification_report(y_test_sp, y_pred_sp))
precision recall f1-score support cotton 0.44 0.23 0.30 69 others 0.05 0.09 0.06 11 wheat 0.49 0.61 0.54 87 accuracy 0.42 167 macro avg 0.33 0.31 0.30 167 weighted avg 0.44 0.42 0.41 167
# Evaluate the accuracy for specific crop types using a confusion matrix
# Create a list of crop types
labels = ["cotton","others","wheat"]
# Create a confusion matrix
cm = confusion_matrix(y_test_sp, y_pred_sp)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(text_kw={"fontsize":10}, xticks_rotation="vertical")
plt.show()
# Compute permutation importance to determine the predictive ability of each feature
p_imp = permutation_importance(rf_sp, x_sp, y_sp, scoring="accuracy", n_repeats=30, random_state=6)
# Visualize the permutation importance of each predictor in a barplot
columns = x_sp.columns
p_imp = abs(p_imp["importances_mean"])
p_imp_df = pd.DataFrame({"feature": columns, "importance": p_imp})
p_imp_df = p_imp_df.sort_values(by=["importance"], ascending=True)
fig = px.bar(p_imp_df, y="feature", x="importance", height=600)
fig.show()
# Create a map comparing between predictions made by the model and ground-truth data
all_preds_sp = rf_sp.predict(x_sp)
Prediction_map = filtered_gdf.copy()
Prediction_map["predicted"] = all_preds_sp.astype("str")
# The predicted values are already crop-name strings, so no further recoding is needed
basemap = "https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}"
attribution = "Tiles © Esri — Source: Esri, i-cubed, USDA, USGS, AEX, GeoEye, Getmapping, Aerogrid, IGN, IGP, UPR-EGP, and the GIS User Community"
Prediction_map.explore(column="predicted", cmap=["yellow","blue","red"], categorical=True, tiles=basemap, attr=attribution, tooltip=["crop_1", "predicted"])
The metrics used to assess model performance and why they are suitable for this sub-task.
We used the classification_report function to generate metrics, including precision, recall, f1-score and accuracy. Precision measures the proportion of correct positive predictions out of all positive predictions made by the model. Recall measures the proportion of true positive predictions out of all actual positive instances including false negatives. Both metrics help assess the model's ability to accurately predict positive instances (precision) and to avoid missing true positives (recall).
The f1-score combines precision and recall, making it a balanced assessment of the model's prediction as it acknowledges the consequences of both false positives and false negatives. The accuracy score, which is the proportion of correct predictions out of all predictions, provides an overall evaluation of the model, but may not accurately reflect how well the model predicts the minority classes. This is because the predictive performance for the majority classes tends to overshadow any effect of the predictions on the minority classes. Hence, F1-score is useful for such cases as it is computed for each class, revealing class-specific information that is not shown by the accuracy score.
We also created a confusion matrix, which provides a helpful visualization of the above metrics. It allows us to better understand class-specific performance by displaying the true positive, true negative, false positive and false negative predictions for each class. It also helps us detect class-imbalance problems and identify consistent patterns of errors. This information is valuable because it suggests how the model could be improved, for example by applying different sampling techniques.
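As a quick illustration of how these per-class metrics follow from the confusion matrix entries, the sketch below recomputes precision, recall and F1 for each class from the cm and labels objects created above (rows of cm are true classes, columns are predicted classes); it should reproduce the values in the classification report.
# Sketch: recover per-class precision, recall and F1 from the confusion matrix
for k, crop_name in enumerate(labels):
    tp = cm[k, k]                # correctly predicted as this class
    fp = cm[:, k].sum() - tp     # predicted as this class but actually another
    fn = cm[k, :].sum() - tp     # actually this class but predicted as another
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    print(f"{crop_name}: precision={precision:.2f}, recall={recall:.2f}, f1={f1:.2f}")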
The strategy used to evaluate model performance and why it is suitable for this sub-task.
To evaluate the model, we created a training set and a test set from the NDVI/red-edge data, so that the model could be trained on the training set and evaluated on the test set. We used the split() function of scikit-learn's GroupShuffleSplit object to split the data such that samples from the same area (polygon) never appear in both the training and the test set. This prevents data leakage between the two sets and gives a more realistic estimate of performance on areas the model has not seen. We also split the data in a 1:1 ratio so that each set contains two areas of interest, making it more likely that both sets include all three crop classes. Having all classes in both sets allowed us to gain insight into the model's predictive ability for every crop class.
When training the Random Forest classifier, we kept the random state consistent throughout, so that the evaluation metrics can be reproduced if the random splitting of the data is executed again. Finally, we computed the permutation importance to determine the predictive ability of each predictor. This shows how the predictors compare with one another and whether any insignificant or redundant predictors, which would only add noise, should be removed.
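A natural extension of this strategy is group-aware cross-validation, where each fold holds out one area of interest so that every area is used as unseen test data exactly once. The sketch below assumes the x_sp, y_sp and groups objects defined above; it is an additional check that was not run for the results reported here.
# Sketch: group-aware cross-validation with one area of interest held out per fold
from sklearn.model_selection import GroupKFold
gkf = GroupKFold(n_splits=4)  # four folds, one per aoi
cv_scores = cross_val_score(
    RandomForestClassifier(n_estimators=200, random_state=6),
    x_sp, y_sp, groups=groups, cv=gkf, scoring="accuracy")
print(cv_scores)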
The model’s suitability as a tool for generating crop type maps and suggestions for improving its predictive performance and adapting the model for deployment in new cropping landscapes.
Our model achieved an accuracy score of 42%, which falls below the desired threshold of 70% for a realistic and valuable predictive output. Although the model performs comparatively well for wheat (the highest f1-score, 0.54), it is not suitable for predicting the other crop types. Grouping vineyard, orchard and alfalfa into one class was attempted to reduce undersampling issues, but the f1-score for the "others" class remained too low for the model to be useful in predicting these crops in new cropping landscapes.
A study by Schulthess et al. found that optimal results require at least 450 training samples for dominant classes and 40 training samples for minority classes. To improve accuracy, we should therefore aim for at least 450 training samples for cotton and wheat and around 50 training samples for the minority crop classes. Furthermore, we can remove redundant and irrelevant features identified by the permutation importance to improve accuracy, reduce processing time and prevent over-fitting. Features with low permutation importance, such as ndvi_8, rededge_9 and rededge_8, can be eliminated, followed by re-evaluation to compare the new accuracy score with the original (see the sketch below).
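A minimal sketch of that re-evaluation step is given below; it assumes the training and test objects created earlier, and the list of low-importance features is provisional rather than a definitive selection.
# Sketch: drop provisionally identified low-importance features and re-evaluate the model
low_importance = ["ndvi_8", "rededge_9", "rededge_8"]
x_train_reduced = x_train_sp.drop(columns=low_importance)
x_test_reduced = x_test_sp.drop(columns=low_importance)
rf_reduced = RandomForestClassifier(n_estimators=200, random_state=6)
rf_reduced.fit(x_train_reduced, y_train_sp.astype(str))
print(classification_report(y_test_sp, rf_reduced.predict(x_test_reduced)))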
The confusion matrix reveals considerable confusion between cotton and wheat, possibly because the predictors for these crops are too similar. To address this, incorporating features that show more variation between the two crop types, such as MSAVI, could be beneficial. MSAVI (the Modified Soil-Adjusted Vegetation Index) is designed to reduce the influence of bare soil, making it useful where NDVI and red-edge become unreliable because of sparse vegetation cover or low chlorophyll content (a brief sketch follows below).
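To illustrate what adding such a feature would involve, the sketch below computes MSAVI2 from the same red and nir window reads used for NDVI. The division by 10000 to convert Sentinel-2 L2A digital numbers to reflectance is an assumption, and this index was not used in the model above.
# Sketch: MSAVI2 from the red and nir arrays read earlier (assumes reflectance = DN / 10000)
red_refl = red.astype("float32") / 10000.0
nir_refl = nir.astype("float32") / 10000.0
msavi = (2 * nir_refl + 1 - np.sqrt((2 * nir_refl + 1) ** 2 - 8 * (nir_refl - red_refl))) / 2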