# Import modules
import os
import json
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pystac_client
import planetary_computer as pc
import plotly.express as px
import plotly.io as pio
import rasterio
from rasterio import windows
from rasterio import features
from rasterio import warp
from skimage import io
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.inspection import permutation_importance
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from pystac.extensions.eo import EOExtension as eo
# setup renderer
if 'google.colab' in str(get_ipython()):
    pio.renderers.default = "colab"
else:
    pio.renderers.default = "jupyterlab"
rng = np.random.RandomState(0)
!pip install rasterstats
from rasterstats import zonal_stats
Successfully installed rasterstats-0.19.0 simplejson-3.19.1
# Import data for the Jupyter Notebook
os.chdir("/home/jovyan/work/GEOG3300_Assignment_Folder/central_asia_aoi/central_asia_aoi")
# Create a path to the working folder
assignment_path = os.path.join(os.getcwd())
# List the files in the working folder to check that we have the correct files in the correct folder
os.listdir(assignment_path)
['central_asia_aoi_3_crop_code.cpg', 'central_asia_aoi_3_crop_code.shx', 'central_asia_aoi_3_crop_code.shp', 'central_asia_aoi_3_crop_code.dbf', 'central_asia_aoi_2_crop_code.gpkg', 'central_asia_aoi_3_crop_code.prj', 'central_asia_aoi_4_crop_code.cpg', 'central_asia_aoi_4_crop_code.shx', 'central_asia_aoi_4_crop_code.dbf', 'central_asia_aoi_4_crop_code.prj', 'central_asia_aoi_1_crop_code.gpkg', 'central_asia_aoi_4_crop_code.shp']
# Create a path to each vector dataset containing field data, then read each dataset into a separate GeoPandas GeoDataFrame named after its polygon number
pol1_path = os.path.join(os.getcwd(), "central_asia_aoi_1_crop_code.gpkg")
pol1_gdf = gpd.read_file(pol1_path)
pol2_path = os.path.join(os.getcwd(), "central_asia_aoi_2_crop_code.gpkg")
pol2_gdf = gpd.read_file(pol2_path)
pol3_path = os.path.join(os.getcwd(), "central_asia_aoi_3_crop_code.shp")
pol3_gdf = gpd.read_file(pol3_path)
pol4_path = os.path.join(os.getcwd(), "central_asia_aoi_4_crop_code.shp")
pol4_gdf = gpd.read_file(pol4_path)
# Inspect that pol1 has been read correctly by printing the data type and the first few rows
print(type(pol1_gdf))
pol1_gdf.head()
<class 'geopandas.geodataframe.GeoDataFrame'>
| | country | region | year | season | crop_1 | aoi | crop1_code | geometry |
|---|---|---|---|---|---|---|---|---|
| 0 | Uzbekistan | Fergana | 2018 | summer | cotton | aoi_1 | 0 | MULTIPOLYGON (((71.18975 40.59049, 71.18977 40... |
| 1 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19309 40.58909, 71.19210 40... |
| 2 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19293 40.58684, 71.19218 40... |
| 3 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19390 40.58708, 71.19298 40... |
| 4 | Uzbekistan | Fergana | 2018 | summer | cotton | aoi_1 | 0 | MULTIPOLYGON (((71.17040 40.55147, 71.16873 40... |
# Inspect that pol2 has been read correctly by printing the data type and the first few rows
print(type(pol2_gdf))
pol2_gdf.head()
<class 'geopandas.geodataframe.GeoDataFrame'>
| | country | region | year | season | crop_1 | aoi | crop1_code | geometry |
|---|---|---|---|---|---|---|---|---|
| 0 | Uzbekistan | Samarkand | 2016 | permanent | vineyard | aoi_2 | 6 | MULTIPOLYGON (((66.98073 39.10310, 66.98211 39... |
| 1 | Uzbekistan | Samarkand | 2016 | unclear | vegetables | aoi_2 | 7 | MULTIPOLYGON (((66.97909 39.10714, 66.97979 39... |
| 2 | Uzbekistan | Samarkand | 2016 | permanent | vineyard | aoi_2 | 6 | MULTIPOLYGON (((66.97806 39.10726, 66.97809 39... |
| 3 | Uzbekistan | Samarkand | 2016 | summer | maize | aoi_2 | 5 | MULTIPOLYGON (((66.98070 39.10780, 66.98098 39... |
| 4 | Uzbekistan | Samarkand | 2016 | winter | wheat | aoi_2 | 1 | MULTIPOLYGON (((66.97461 39.10863, 66.97308 39... |
# Inspect that pol3 has been read correctly by printing the data type and the first few rows
print(type(pol3_gdf))
pol3_gdf.head()
<class 'geopandas.geodataframe.GeoDataFrame'>
| | country | region | year | season | crop_1 | aoi | crop1_code | geometry |
|---|---|---|---|---|---|---|---|---|
| 0 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.19614 39.19174, 65.19652 39.18742... |
| 1 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.19085 39.19327, 65.18710 39.19826... |
| 2 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.19075 39.18957, 65.19072 39.18759... |
| 3 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.18067 39.18739, 65.18743 39.18734... |
| 4 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.18304 39.18958, 65.17674 39.18661... |
# Inspect that pol4 has been read correctly by printing the data type and the first few rows
print(type(pol4_gdf))
pol4_gdf.head()
<class 'geopandas.geodataframe.GeoDataFrame'>
| | country | region | year | season | crop_1 | aoi | crop1_code | geometry |
|---|---|---|---|---|---|---|---|---|
| 0 | Uzbekistan | Kashkadarya | 2018 | winter | wheat | aoi_4 | 1 | POLYGON ((65.30076 38.79538, 65.30125 38.79608... |
| 1 | Uzbekistan | Kashkadarya | 2018 | summer | cotton | aoi_4 | 0 | POLYGON ((65.29607 38.79463, 65.29717 38.79513... |
| 2 | Uzbekistan | Kashkadarya | 2018 | winter | wheat | aoi_4 | 1 | POLYGON ((65.28487 38.80078, 65.29384 38.80442... |
| 3 | Uzbekistan | Kashkadarya | 2018 | permanent | orchard | aoi_4 | 2 | POLYGON ((65.29744 38.79249, 65.29819 38.79281... |
| 4 | Uzbekistan | Kashkadarya | 2018 | permanent | orchard | aoi_4 | 2 | POLYGON ((65.29721 38.79514, 65.29837 38.79563... |
We read the four spatial data files containing cropping field boundaries into GeoDataFrames, then checked the data type of each object using the type() function. This returned "<class 'geopandas.geodataframe.GeoDataFrame'>" for all four objects, confirming that the data were read successfully as GeoDataFrames.
We also inspected the structure of each GeoDataFrame's attribute table using the head() function, which returned the first five rows of each table. Each table has columns storing non-spatial attributes and a geometry column (a GeoSeries) storing the geometric data for each row, giving spatial context to the non-spatial attributes. The presence of this vector geometry column confirmed that these datasets are GeoPandas GeoDataFrames.
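The same checks can be written more compactly. Below is a minimal sketch (the file-name dictionary and loop are illustrative only, not the code used above) that reads all four files from assignment_path and asserts that each loads as a GeoDataFrame.
# Sketch: read the four field-boundary files and confirm each is a GeoDataFrame.
# File names come from the folder listing above; the dictionary itself is illustrative.
field_files = {
    "pol1": "central_asia_aoi_1_crop_code.gpkg",
    "pol2": "central_asia_aoi_2_crop_code.gpkg",
    "pol3": "central_asia_aoi_3_crop_code.shp",
    "pol4": "central_asia_aoi_4_crop_code.shp",
}
field_gdfs = {name: gpd.read_file(os.path.join(assignment_path, fname)) for name, fname in field_files.items()}
for name, gdf in field_gdfs.items():
    assert isinstance(gdf, gpd.GeoDataFrame), f"{name} did not load as a GeoDataFrame"
    print(name, gdf.shape, list(gdf.columns))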
# Visualize pol1 in an interactive map
m1 = pol1_gdf.explore(column="crop_1", categorical=True, cmap=["#232727", "#9EB400", "#A90BD3", "red", "green", "#CC8500"], tiles="OpenStreetMap")
m1
# Visualize pol2 in an interactive map
m2 = pol2_gdf.explore(column="crop_1", categorical=True, cmap=["#260BD3", "brown", "#232727", "#9EB400", "red", "#04400B", "purple", "#CC8500"], tiles="OpenStreetMap")
m2
# Visualize pol3 in an interactive map
m3 = pol3_gdf.explore(column="crop_1", categorical=True, cmap=["#232727", "#CC8500"], tiles="OpenStreetMap")
m3
# Visualize pol4 in an interactive map
m4 = pol4_gdf.explore(column="crop_1", categorical=True, cmap=["#260BD3", "#232727", "#D800CE", "red", "purple", "#CC8500"], tiles="OpenStreetMap")
m4
OpenStreetMap tiles were used to provide context for the area by displaying physical features such as roads and watercourses, as well as conceptual features such as administrative boundaries. Qualitative colour scales were used to represent crop types, as they are categorical data. Darker colours were chosen for the crop types to provide sufficient contrast with the lighter base map, and crop colours were chosen to give enough contrast between crop types so that viewers can distinguish them easily. Crop colours were kept consistent across the four maps, making it easier for viewers to follow the crop attributes.
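The consistent colours rely on explore() pairing each colour list with the crop categories in sorted order. A minimal sketch of how the same mapping could be kept in a single dictionary (assembled from the colour lists used above) is shown below.
# Sketch: one crop-to-colour mapping reused for every map, built from the lists above.
crop_colours = {
    "alfalfa": "#260BD3", "beans": "brown", "cotton": "#232727",
    "fallow": "#D800CE", "maize": "#9EB400", "onions": "#A90BD3",
    "orchard": "red", "rice": "green", "vegetables": "#04400B",
    "vineyard": "purple", "wheat": "#CC8500",
}
def crop_cmap(gdf):
    # Return colours for the crops present, in the sorted category order explore() uses
    return [crop_colours[crop] for crop in sorted(gdf["crop_1"].unique())]
# e.g. pol1_gdf.explore(column="crop_1", categorical=True, cmap=crop_cmap(pol1_gdf), tiles="OpenStreetMap")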
# Check the CRS of Pol1.
print(f"The crs of Polygon 1 is {pol1_gdf.crs}")
# Explore the crop types present and the corresponding number of observations for each type
print("The number of samples by crop type for Polygon 1:")
pol1_gdf.groupby('crop_1').count().loc[:,"crop1_code"]
The crs of Polygon 1 is epsg:4326
The number of samples by crop type for Polygon 1:
crop_1
cotton     34
maize       1
onions      1
orchard     1
rice        2
wheat      35
Name: crop1_code, dtype: int64
# Check the CRS of Pol2.
print(f"The crs of Polygon 2 is {pol2_gdf.crs}")
# Explore the crop types present and the corresponding number of observations for each type
print("The number of samples by crop type for Polygon 2:")
pol2_gdf.groupby('crop_1').count().loc[:,"crop1_code"]
The crs of Polygon 2 is epsg:4326
The number of samples by crop type for Polygon 2:
crop_1
alfalfa       22
beans          2
cotton         1
maize          9
orchard       15
vegetables     7
vineyard      26
wheat         23
Name: crop1_code, dtype: int64
# Check the CRS of Pol3.
print(f"The crs of Polygon 3 is {pol3_gdf.crs}")
# Explore the crop types present and the corresponding number of observations for each type
print("The number of samples by crop type for Polygon 3:")
pol3_gdf.groupby('crop_1').count().loc[:,"crop1_code"]
The crs of Polygon 3 is epsg:4326
The number of samples by crop type for Polygon 3:
crop_1
cotton    64
wheat     55
Name: crop1_code, dtype: int64
# Check the CRS of Pol4.
print(f"The crs of Polygon 4 is {pol4_gdf.crs}")
# Explore the crop types present and the corresponding number of observations for each type
print("The number of samples by crop type for Polygon 4:")
pol4_gdf.groupby('crop_1').count().loc[:,"crop1_code"]
The crs of Polygon 4 is epsg:4326
The number of samples by crop type for Polygon 4:
crop_1
alfalfa      1
cotton      55
fallow       3
orchard     12
vineyard     1
wheat       75
Name: crop1_code, dtype: int64
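To compare class balance across the four areas of interest, the per-AOI counts above could be combined into one table. A minimal sketch using the pandas imported earlier:
# Sketch: combine the crop counts of the four AOIs into a single table for comparison.
crop_counts = pd.concat(
    {f"aoi_{i}": gdf.groupby("crop_1")["crop1_code"].count()
     for i, gdf in enumerate([pol1_gdf, pol2_gdf, pol3_gdf, pol4_gdf], start=1)},
    axis=1,
).fillna(0).astype(int)
print(crop_counts)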
# Create the smallest bounding box geometry around pol1
pol1_env = gpd.GeoSeries(pol1_gdf.geometry.unary_union).envelope
# Visualize the bounding box and the field polygons within with an interactive map to check that the envelope was generated correctly
pol1_env.explore(m=m1, color="red", style_kwds={"fillOpacity": 0})
# Set the CRS of pol1 bounding box to EPSG:4326
pol1_env = pol1_env.set_crs("EPSG:4326")
# Print the new CRS and check that the type of data is a GeoSeries object
print(pol1_env.crs)
print(type(pol1_env))
EPSG:4326
<class 'geopandas.geoseries.GeoSeries'>
# Create the smallest bounding box geometry around pol2
pol2_env = gpd.GeoSeries(pol2_gdf.geometry.unary_union).envelope
# Visualize the bounding box and the field polygons within with an interactive map to check that the envelope was generated correctly
pol2_env.explore(m=m2, color="red", style_kwds={"fillOpacity": 0})
# Set the CRS of pol2 bounding box to EPSG:4326
pol2_env = pol2_env.set_crs("EPSG:4326")
# Print the new CRS and check that the type of data is a GeoSeries object
print(pol2_env.crs)
print(type(pol2_env))
EPSG:4326
<class 'geopandas.geoseries.GeoSeries'>
# Create the smallest bounding box geometry around pol3
pol3_env = gpd.GeoSeries(pol3_gdf.geometry.unary_union).envelope
# Visualize the bounding box and the field polygons within with an interactive map to check that the envelope was generated correctly
pol3_env.explore(m=m3, color="red", style_kwds={"fillOpacity": 0})
# Set the CRS of pol3 bounding box to EPSG:4326
pol3_env = pol3_env.set_crs("EPSG:4326")
# Print the new CRS and check that the type of data is a GeoSeries object
print(pol3_env.crs)
print(type(pol3_env))
EPSG:4326
<class 'geopandas.geoseries.GeoSeries'>
# Create the smallest bounding box geometry around pol4
pol4_env = gpd.GeoSeries(pol4_gdf.geometry.unary_union).envelope
# Visualize the bounding box and the field polygons within with an interactive map to check that the envelope was generated correctly
pol4_env.explore(m=m4, color="red", style_kwds={"fillOpacity": 0})
# Set the CRS of pol4 bounding box to EPSG:4326
pol4_env = pol4_env.set_crs("EPSG:4326")
# Print the new CRS and check that the type of data is a GeoSeries object
print(pol4_env.crs)
print(type(pol4_env))
EPSG:4326
<class 'geopandas.geoseries.GeoSeries'>
We used the unary_union operation to combine the polygons into one geometry before computing the envelope of each GeoDataFrame. Without unary_union, the envelope property would compute a separate envelope for every polygon geometry in the GeoDataFrame, producing many small envelopes rather than one that covers the whole area of interest. After unary_union merges all polygon geometries into a single geometry, envelope returns one bounding box that covers the whole area of interest.
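The difference is easy to verify; a minimal sketch (the counts in the comment come from the Polygon 1 data above):
# Sketch: without unary_union, .envelope returns one box per field polygon;
# with it, a single box covers the whole AOI.
per_feature_envelopes = pol1_gdf.geometry.envelope
aoi_envelope = gpd.GeoSeries(pol1_gdf.geometry.unary_union).envelope
print(len(per_feature_envelopes), len(aoi_envelope))  # e.g. 74 vs 1 for Polygon 1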
# Transform the pol1 bounding box to EPSG:4326
pol1_env = pol1_env.to_crs("EPSG:4326")
# Check that the bounding box has been transformed to EPSG:4326 by visualizing on the map. It should appear in the same location as the previous map with field polygons
m1e = pol1_env.explore(color="red", style_kwds={"fillOpacity": 0}, tiles = "OpenStreetMap")
m1e
# Transform the pol2 bounding box to EPSG:4326
pol2_env = pol2_env.to_crs("EPSG:4326")
# Check that the bounding box has been transformed to EPSG:4326 by visualizing on the map. It should appear in the same location as the previous map with field polygons
m2e = pol2_env.explore(color="red", style_kwds={"fillOpacity": 0}, tiles = "OpenStreetMap")
m2e
# Transform the pol3 bounding box to EPSG:4326
pol3_env = pol3_env.to_crs("EPSG:4326")
# Check that the bounding box has been transformed to EPSG:4326 by visualizing on the map. It should appear in the same location as the previous map with field polygons
m3e = pol3_env.explore(color="red", style_kwds={"fillOpacity": 0}, tiles = "OpenStreetMap")
m3e
# Transform the pol4 bounding box to EPSG:4326
pol4_env = pol4_env.to_crs("EPSG:4326")
# Check that the bounding box has been transformed to EPSG:4326 by visualizing on the map. It should appear in the same location as the previous map with field polygons
m4e = pol4_env.explore(color="red", style_kwds={"fillOpacity": 0}, tiles = "OpenStreetMap")
m4e
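Note the difference between set_crs (used earlier) and to_crs (used here): set_crs only attaches CRS metadata to the existing coordinates, while to_crs reprojects the coordinate values into the target CRS. A minimal sketch, where EPSG:32642 (UTM zone 42N) is the projected CRS reported in the Sentinel-2 metadata for Polygon 1:
# Sketch: set_crs labels the geometry with a CRS without changing coordinates;
# to_crs transforms the coordinates into the target CRS.
env_wgs84 = pol1_env.set_crs("EPSG:4326", allow_override=True)
env_utm = env_wgs84.to_crs("EPSG:32642")
print(env_wgs84.total_bounds)  # decimal degrees
print(env_utm.total_bounds)    # metres (UTM zone 42N)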
We will create a Random Forest Classifier machine learning model to predict crop types using predictor variables derived from raster datasets, then evaluate the model using a range of metrics. The raster datasets will be sourced from Sentinel-2, which is freely available in the public domain. The Sentinel-2 Level-2A product is corrected for the scattering and absorption effects of atmospheric particles, giving an accurate representation of the Earth's surface. It has a high spatial resolution of up to 10 m and a high revisit frequency of five days at the equator. Since most of our field polygons are longer than 30 metres, the raster datasets are suitable for our model. The high temporal resolution allows us to collect multi-date time-series data, which captures the differing phenology of crops at specific growth stages. Sentinel-2 also captures red-edge spectral bands, which are highly sensitive to chlorophyll content and can therefore enhance the ability to distinguish crop types and phenological states.
A review of crop-type classification technologies shows that Normalized Difference Vegetation Index (NDVI) data have been applied intensively for predicting crop types and can achieve accuracies of over 90%. For example, a study by Belgiu obtained overall accuracies of 78% to 96% using NDVI time series from Sentinel-2. NDVI has its own limitations, as it relies on differences in spectral signatures, which can be similar for different crops at specific phenological stages. Therefore, we will use cloud-free satellite images from May to October, the growing period for crops in Uzbekistan. By using data from the growing period, we ensure that data from periods with no cropping activity are excluded and that changes in vegetation over time are taken into account. Studies have also shown that models including red-edge bands achieve higher classification accuracy, especially for vegetation data. Based on this evidence, we decided to use NDVI and red-edge bands from Sentinel-2, collected from May to October 2018, as our predictors. We chose datasets from 2018 because that is the year the field data were collected.
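For reference, the NDVI used below is computed as NDVI = (NIR - Red) / (NIR + Red) and is bounded between -1 and 1. A minimal sketch with illustrative reflectance values, using the numpy imported above:
# Illustrative values only; higher NDVI indicates denser green vegetation.
nir_example = np.array([3000.0, 2500.0, 400.0])
red_example = np.array([800.0, 1200.0, 380.0])
print((nir_example - red_example) / (nir_example + red_example))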
# Open a connection to the Microsoft Planetary Computer's root STAC catalog
pc_catalog = pystac_client.Client.open(
url="https://planetarycomputer.microsoft.com/api/stac/v1",)
# Download NDVI data for Polygon 1
# Get the bounding box as a shapely object
pol1_env_shapely = pol1_env[0]
# Create an empty list to store ndarray of monthly ndvi values
ndvi_arr1 = []
# Pre-set time to May-October, 2018
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol1_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to append the lowest cloud cover in the item collection to the list
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the red and NIR least cloudy Sentinel-2 assets
least_cloudy_red_href = pc.sign(least_cloudy.assets["B04"].href)
least_cloudy_nir08_href = pc.sign(least_cloudy.assets["B08"].href)
# For the first month get the shape of the array
# For subsequent months resample all arrays to this shape
if i == 5:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol1_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
red = src.read(1, window=aoi_window)
# get affine transform for the windowed read
# this allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# create a copy of the metadata for saving later
out_meta1 = src.meta
# get the shape of the red ndarray
# force all other rasters to match this shape
out_shape1 = red.shape
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol1_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
nir = src.read(1, out_shape=out_shape1, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
else:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol1_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
red = src.read(1, out_shape=out_shape1, window=aoi_window)
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol1_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
nir = src.read(1, out_shape=out_shape1, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
# Append NDVI ndarray to list
ndvi_arr1.append(ndvi)
# Stack the ndvi ndarray's to create a multiband raster
ndvi_stacked1 = np.stack(ndvi_arr1, axis=0)
print(f"the shape of ndvi_arr is {ndvi_stacked1.shape}")
# Inspect the metadata
print(out_meta1)
# Update meta for saving
out_meta1["dtype"] = "float32"
out_meta1["count"] = 6
out_meta1["height"] = ndvi_arr1[0].shape[0]
out_meta1["width"] = ndvi_arr1[0].shape[1]
out_meta1["transform"] = win_transform
print(out_meta1)
# Visualize data imported by an animated time-series diagram
figNDVI1 = px.imshow(
ndvi_stacked1,
animation_frame=0,
color_continuous_scale="viridis",
range_color=[0, 0.8],
height = 800,
width = 600)
figNDVI1.update_xaxes(showticklabels=False)
figNDVI1.update_yaxes(showticklabels=False)
figNDVI1.show()
processing month 5
processing month 6
processing month 7
processing month 8
processing month 9
processing month 10
the shape of ndvi_arr is (6, 487, 315)
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 10980, 'height': 10980, 'count': 1, 'crs': CRS.from_epsg(32642), 'transform': Affine(10.0, 0.0, 600000.0, 0.0, -10.0, 4500000.0)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 315, 'height': 487, 'count': 6, 'crs': CRS.from_epsg(32642), 'transform': Affine(10.0, 0.0, 683405.9436293562, 0.0, -10.0, 4496096.317958007)}
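The metadata were updated above so the stack can be saved later. A minimal sketch (the output filename is hypothetical) of how the updated metadata could be used to write the stacked NDVI as a multiband GeoTIFF:
# Sketch: write the 6-band NDVI stack using the updated metadata (hypothetical filename).
with rasterio.open("pol1_ndvi_stack.tif", "w", **out_meta1) as dst:
    dst.write(ndvi_stacked1.astype("float32"))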
# Download Red-edge data for Polygon 1
# Get the bounding box as a shapely object
pol1_env_shapely = pol1_env[0]
# Create an empty list to store ndarrays of monthly red-edge values
rededge_arr1 = []
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol1_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to append the lowest cloud cover in the item collection to the list
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the Red-edge least cloudy Sentinel-2 assets
least_cloudy_rededge8A_href = pc.sign(least_cloudy.assets["B8A"].href)
# For the first month get the shape of the array
# For subsequent months resample all arrays to this shape
if i == 5:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol1_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# Create a copy of the metadata for saving later
out_meta_rededge1 = src.meta
# Get the shape of the rededge ndarray
# Force all other rasters to match this shape
out_shape_rededge8A = rededge8A.shape
else:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol1_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window, out_shape=out_shape_rededge8A)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
rededge8A = rededge8A.astype("float32")
# Append rededge ndarray to list
rededge_arr1.append(rededge8A)
# Stack the rededge ndarray's to create a multiband raster
rededge_stacked1 = np.stack(rededge_arr1, axis=0)
print(f"the shape of rededge_arr is {rededge_stacked1.shape}")
# Inspect the metadata
print(out_meta_rededge1)
# Update meta for saving
out_meta_rededge1["dtype"] = "float32"
out_meta_rededge1["count"] = 6
out_meta_rededge1["height"] = rededge_arr1[0].shape[0]
out_meta_rededge1["width"] = rededge_arr1[0].shape[1]
out_meta_rededge1["transform"] = win_transform
print(out_meta_rededge1)
# Visualize data imported by an animated time-series diagram
figRE1 = px.imshow(
rededge_stacked1,
animation_frame=0,
color_continuous_scale="IceFire",
range_color=[0, 5000],
height = 800,
width = 600)
figRE1.update_xaxes(showticklabels=False)
figRE1.update_yaxes(showticklabels=False)
figRE1.show()
processing month 5
processing month 6
processing month 7
processing month 8
processing month 9
processing month 10
the shape of rededge_arr is (6, 244, 158)
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 5490, 'height': 5490, 'count': 1, 'crs': CRS.from_epsg(32642), 'transform': Affine(20.0, 0.0, 600000.0, 0.0, -20.0, 4500000.0)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 158, 'height': 244, 'count': 6, 'crs': CRS.from_epsg(32642), 'transform': Affine(20.0, 0.0, 683405.9436293562, 0.0, -20.0, 4496096.317958007)}
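The same search-and-windowed-read pattern is repeated below for every area of interest, month, and band. A hedged sketch of how it could be factored into a helper function (names and structure are illustrative, not the code actually run in this notebook):
# Sketch: find the least cloudy Sentinel-2 L2A scene for a month and read one band
# windowed to the AOI bounding box.
def read_band_for_month(catalog, aoi_shapely, year, month, asset_key, out_shape=None):
    start = f"{year}-{month:02d}-01"
    end = f"{year}-{month + 1:02d}-01" if month < 12 else f"{year + 1}-01-01"
    search = catalog.search(
        collections=["sentinel-2-l2a"],
        intersects=aoi_shapely,
        datetime=f"{start}/{end}",
        query={"eo:cloud_cover": {"lt": 10}})
    items = search.item_collection()
    least_cloudy = min(items, key=lambda item: eo.ext(item).cloud_cover)
    href = pc.sign(least_cloudy.assets[asset_key].href)
    with rasterio.open(href) as src:
        bounds = warp.transform_bounds("epsg:4326", src.crs, *features.bounds(aoi_shapely))
        window = windows.from_bounds(transform=src.transform, *bounds)
        data = src.read(1, window=window, out_shape=out_shape).astype("float32")
        return data, src.window_transform(window), src.meta
# Example usage mirroring the NDVI loop above (commented out, illustrative only):
# ndvi_list, shape = [], None
# for month in range(5, 11):
#     red, win_transform, meta = read_band_for_month(pc_catalog, pol1_env_shapely, 2018, month, "B04", shape)
#     shape = shape or red.shape
#     nir, _, _ = read_band_for_month(pc_catalog, pol1_env_shapely, 2018, month, "B08", shape)
#     ndvi_list.append((nir - red) / (nir + red))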
# Download NDVI data for Polygon 2
# Get the bounding box as a shapely object
pol2_env_shapely = pol2_env[0]
# Create an empty list to store ndarray of monthly ndvi values
ndvi_arr2 = []
# Pre-set time to May-October, 2018
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol2_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to append the lowest cloud cover in the item collection to the list
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the red and NIR least cloudy Sentinel-2 assets
least_cloudy_red_href = pc.sign(least_cloudy.assets["B04"].href)
least_cloudy_nir08_href = pc.sign(least_cloudy.assets["B08"].href)
# For the first month get the shape of the array
# For subsequent months resample all arrays to this shape
if i == 5:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol2_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
red = src.read(1, window=aoi_window)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# Create a copy of the metadata for saving later
out_meta2 = src.meta
# Get the shape of the red ndarray
# Force all other rasters to match this shape
out_shape2 = red.shape
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol2_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
nir = src.read(1, out_shape=out_shape2, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
else:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol2_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
red = src.read(1, out_shape=out_shape2, window=aoi_window)
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol2_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
nir = src.read(1, out_shape=out_shape2, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
# Append NDVI ndarray to list
ndvi_arr2.append(ndvi)
# Stack the ndvi ndarray's to create a multiband raster
ndvi_stacked2 = np.stack(ndvi_arr2, axis=0)
print(f"the shape of ndvi_arr is {ndvi_stacked2.shape}")
# Inspect the metadata
print(out_meta2)
# Update meta for saving
out_meta2["dtype"] = "float32"
out_meta2["count"] = 6
out_meta2["height"] = ndvi_arr2[0].shape[0]
out_meta2["width"] = ndvi_arr2[0].shape[1]
out_meta2["transform"] = win_transform
print(out_meta2)
# Visualize data imported by an animated time-series diagram
figNDVI2 = px.imshow(
ndvi_stacked2,
animation_frame=0,
color_continuous_scale="viridis",
range_color=[0, 0.8],
height = 800,
width = 600)
figNDVI2.update_xaxes(showticklabels=False)
figNDVI2.update_yaxes(showticklabels=False)
figNDVI2.show()
processing month 5
processing month 6
processing month 7
processing month 8
processing month 9
processing month 10
the shape of ndvi_arr is (6, 906, 1118)
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 10980, 'height': 10980, 'count': 1, 'crs': CRS.from_epsg(32642), 'transform': Affine(10.0, 0.0, 300000.0, 0.0, -10.0, 4400040.0)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 1118, 'height': 906, 'count': 6, 'crs': CRS.from_epsg(32642), 'transform': Affine(10.0, 0.0, 314518.683904971, 0.0, -10.0, 4338935.877525085)}
# Download Red-edge data for Polygon 2
# Get the bounding box as a shapely object
pol2_env_shapely = pol2_env[0]
# Create an empty list to store ndarray of monthly ndvi values
rededge_arr2 = []
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol2_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to append the lowest cloud cover in the item collection to the list
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the Red-edge least cloudy Sentinel-2 assets
least_cloudy_rededge8A_href = pc.sign(least_cloudy.assets["B8A"].href)
# For the first month get the shape of the array
# For subsequent months resample all arrays to this shape
if i == 5:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol2_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# Create a copy of the metadata for saving later
out_meta_rededge2 = src.meta
# Get the shape of the rededge ndarray
# Force all other rasters to match this shape
out_shape_rededge8A2 = rededge8A.shape
else:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol2_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window, out_shape=out_shape_rededge8A2)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
rededge8A = rededge8A.astype("float32")
# Append rededge ndarray to list
rededge_arr2.append(rededge8A)
# Stack the rededge ndarray's to create a multiband raster
rededge_stacked2 = np.stack(rededge_arr2, axis=0)
print(f"the shape of rededge_arr is {rededge_stacked2.shape}")
# Inspect the metadata
print(out_meta_rededge2)
# Update meta for saving
out_meta_rededge2["dtype"] = "float32"
out_meta_rededge2["count"] = 6
out_meta_rededge2["height"] = rededge_arr2[0].shape[0]
out_meta_rededge2["width"] = rededge_arr2[0].shape[1]
out_meta_rededge2["transform"] = win_transform
print(out_meta_rededge2)
# Visualize data imported by an animated time-series diagram
figRE2 = px.imshow(
rededge_stacked2,
animation_frame=0,  # axis 0 is the month dimension of the stacked array
color_continuous_scale="IceFire",
range_color=[0, 5000],
height = 800,
width = 600)
figRE2.update_xaxes(showticklabels=False)
figRE2.update_yaxes(showticklabels=False)
figRE2.show()
processing month 5
processing month 6
processing month 7
processing month 8
processing month 9
processing month 10
the shape of rededge_arr is (6, 453, 559)
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 5490, 'height': 5490, 'count': 1, 'crs': CRS.from_epsg(32642), 'transform': Affine(20.0, 0.0, 300000.0, 0.0, -20.0, 4400040.0)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 559, 'height': 453, 'count': 6, 'crs': CRS.from_epsg(32642), 'transform': Affine(20.0, 0.0, 314518.683904971, 0.0, -20.0, 4338935.877525085)}
# Download NDVI data for Polygon 3
# Get the bounding box as a shapely object
pol3_env_shapely = pol3_env[0]
# Create an empty list to store ndarray of monthly ndvi values
ndvi_arr3 = []
# Pre-set time to May-October, 2018
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol3_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to append the lowest cloud cover in the item collection to the list
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the red and NIR least cloudy Sentinel-2 assets
least_cloudy_red_href = pc.sign(least_cloudy.assets["B04"].href)
least_cloudy_nir08_href = pc.sign(least_cloudy.assets["B08"].href)
# For the first month get the shape of the array
# For subsequent months resample all arrays to this shape
if i == 5:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol3_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
red = src.read(1, window=aoi_window)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# Create a copy of the metadata for saving later
out_meta3 = src.meta
# Get the shape of the red ndarray
# Force all other rasters to match this shape
out_shape3 = red.shape
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol3_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
nir = src.read(1, out_shape=out_shape3, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
else:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol3_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
red = src.read(1, out_shape=out_shape3, window=aoi_window)
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol3_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
nir = src.read(1, out_shape=out_shape3, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
# Append NDVI ndarray to list
ndvi_arr3.append(ndvi)
# Stack the ndvi ndarray's to create a multiband raster
ndvi_stacked3 = np.stack(ndvi_arr3, axis=0)
print(f"the shape of ndvi_arr is {ndvi_stacked3.shape}")
# Inspect the metadata
print(out_meta3)
# Update meta for saving
out_meta3["dtype"] = "float32"
out_meta3["count"] = 6
out_meta3["height"] = ndvi_arr3[0].shape[0]
out_meta3["width"] = ndvi_arr3[0].shape[1]
out_meta3["transform"] = win_transform
print(out_meta3)
# Visualize data imported by an animated time-series diagram
figNDVI3 = px.imshow(
ndvi_stacked3,
animation_frame=0,
color_continuous_scale="viridis",
range_color=[0, 0.4],
height = 800,
width = 600)
figNDVI3.update_xaxes(showticklabels=False)
figNDVI3.update_yaxes(showticklabels=False)
figNDVI3.show()
processing month 5
processing month 6
processing month 7
processing month 8
processing month 9
processing month 10
the shape of ndvi_arr is (6, 837, 768)
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 10980, 'height': 10980, 'count': 1, 'crs': CRS.from_epsg(32641), 'transform': Affine(10.0, 0.0, 600000.0, 0.0, -10.0, 4400040.0)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 768, 'height': 837, 'count': 6, 'crs': CRS.from_epsg(32641), 'transform': Affine(10.0, 0.0, 682112.5818372199, 0.0, -10.0, 4344341.2613596255)}
# Download Red-edge data for Polygon 3
# Get the bounding box as a shapely object
pol3_env_shapely = pol3_env[0]
# Create an empty list to store ndarray of monthly ndvi values
rededge_arr3 = []
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol3_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to append the lowest cloud cover in the item collection to the list
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the Red-edge least cloudy Sentinel-2 assets
least_cloudy_rededge8A_href = pc.sign(least_cloudy.assets["B8A"].href)
# For the first month get the shape of the array
# For subsequent months resample all arrays to this shape
if i == 5:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol3_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# Create a copy of the metadata for saving later
out_meta_rededge3 = src.meta
# Get the shape of the rededge ndarray
# Force all other rasters to match this shape
out_shape_rededge8A3 = rededge8A.shape
else:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol3_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window, out_shape=out_shape_rededge8A3)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
rededge8A = rededge8A.astype("float32")
# Append rededge ndarray to list
rededge_arr3.append(rededge8A)
# Stack the rededge ndarray's to create a multiband raster
rededge_stacked3 = np.stack(rededge_arr3, axis=0)
print(f"the shape of rededge_arr is {rededge_stacked3.shape}")
# Inspect the metadata
print(out_meta_rededge3)
# Update meta for saving
out_meta_rededge3["dtype"] = "float32"
out_meta_rededge3["count"] = 6
out_meta_rededge3["height"] = rededge_arr3[0].shape[0]
out_meta_rededge3["width"] = rededge_arr3[0].shape[1]
out_meta_rededge3["transform"] = win_transform
print(out_meta_rededge3)
# Visualize data imported by an animated time-series diagram
figRE3 = px.imshow(
rededge_stacked3,
animation_frame=0,
color_continuous_scale="IceFire",
range_color=[0, 5000],
height = 800,
width = 600)
figRE3.update_xaxes(showticklabels=False)
figRE3.update_yaxes(showticklabels=False)
figRE3.show()
processing month 5
processing month 6
processing month 7
processing month 8
processing month 9
processing month 10
the shape of rededge_arr is (6, 419, 384)
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 5490, 'height': 5490, 'count': 1, 'crs': CRS.from_epsg(32641), 'transform': Affine(20.0, 0.0, 600000.0, 0.0, -20.0, 4400040.0)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 384, 'height': 419, 'count': 6, 'crs': CRS.from_epsg(32641), 'transform': Affine(20.0, 0.0, 682112.5818372199, 0.0, -20.0, 4344341.2613596255)}
# Download NDVI data for Polygon 4
# Get the bounding box as a shapely object
pol4_env_shapely = pol4_env[0]
# Create an empty list to store ndarray of monthly ndvi values
ndvi_arr4 = []
# Pre-set time to May-October, 2018
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol4_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to append the lowest cloud cover in the item collection to the list
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the red and NIR least cloudy Sentinel-2 assets
least_cloudy_red_href = pc.sign(least_cloudy.assets["B04"].href)
least_cloudy_nir08_href = pc.sign(least_cloudy.assets["B08"].href)
# For the first month get the shape of the array
# For subsequent months resample all arrays to this shape
if i == 5:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol4_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
red = src.read(1, window=aoi_window)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# Create a copy of the metadata for saving later
out_meta4 = src.meta
# Get the shape of the red ndarray
# Force all other rasters to match this shape
out_shape4 = red.shape
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol4_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
nir = src.read(1, out_shape=out_shape4, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
else:
# Read red band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_red_href) as src:
aoi_bounds = features.bounds(pol4_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
red = src.read(1, out_shape=out_shape4, window=aoi_window)
# Read nir band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_nir08_href) as src:
aoi_bounds = features.bounds(pol4_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
# Here we pass out_shape into read to define the shape of the array data is read into
nir = src.read(1, out_shape=out_shape4, window=aoi_window)
# Compute NDVI
# Cast to float as NDVI is bound between -1 and 1
nir = nir.astype("float32")
red = red.astype("float32")
ndvi = (nir - red) / (nir + red)
# Append NDVI ndarray to list
ndvi_arr4.append(ndvi)
# Stack the ndvi ndarray's to create a multiband raster
ndvi_stacked4 = np.stack(ndvi_arr4, axis=0)
print(f"the shape of ndvi_arr is {ndvi_stacked4.shape}")
# Inspect the metadata
print(out_meta4)
# Update meta for saving
out_meta4["dtype"] = "float32"
out_meta4["count"] = 6
out_meta4["height"] = ndvi_arr4[0].shape[0]
out_meta4["width"] = ndvi_arr4[0].shape[1]
out_meta4["transform"] = win_transform
print(out_meta4)
# Visualize data imported by an animated time-series diagram
figNDVI4 = px.imshow(
ndvi_stacked4,
animation_frame=0,
color_continuous_scale="viridis",
range_color=[0, 0.4],
height = 800,
width = 600)
figNDVI4.update_xaxes(showticklabels=False)
figNDVI4.update_yaxes(showticklabels=False)
figNDVI4.show()
processing month 5
processing month 6
processing month 7
/tmp/ipykernel_58/1739019700.py:121: RuntimeWarning: invalid value encountered in divide
processing month 8
processing month 9
processing month 10
the shape of ndvi_arr is (6, 874, 379)
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 10980, 'height': 10980, 'count': 1, 'crs': CRS.from_epsg(32641), 'transform': Affine(10.0, 0.0, 699960.0, 0.0, -10.0, 4400040.0)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 379, 'height': 874, 'count': 6, 'crs': CRS.from_epsg(32641), 'transform': Affine(10.0, 0.0, 696976.8515412887, 0.0, -10.0, 4298978.419482002)}
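The RuntimeWarning above is raised where nir + red equals zero (for example at nodata edges), so the NDVI division produces NaN for those pixels. A minimal sketch of one way the division inside the loop could be guarded:
# Sketch: only divide where the denominator is non-zero; other pixels stay NaN.
ndvi = np.full_like(nir, np.nan, dtype="float32")
np.divide(nir - red, nir + red, out=ndvi, where=(nir + red) != 0)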
# Download Red-edge data for Polygon 4
# Get the bounding box as a shapely object
pol4_env_shapely = pol4_env[0]
# Create an empty list to store ndarray of monthly ndvi values
rededge_arr4 = []
months = range(5, 11)
year = 2018
# Create a for loop that loops to download data for each month
for i in months:
print(f"processing month {i}")
# Convert numeric month indicators from single digit (1)
# to two digits (e.g. 01) to match the format of time query
if i < 10:
mnth = str(0) + str(i)
else:
mnth = str(i)
# Create month indicators for the start of the next month
# to constrain the search to one month at a time
if i < 9:
mnth_next = str(0) + str(i + 1)
else:
mnth_next = str(i + 1)
# Create a time of interest search for each month in turn
time_of_interest = f"{year}-{mnth}-01/{year}-{mnth_next}-01"
search = pc_catalog.search(
collections=["sentinel-2-l2a"],
intersects=pol4_env_shapely,
datetime=time_of_interest,
query={"eo:cloud_cover": {"lt": 10}})
# Define a variable for the item collection of Sentinel-2 assets that matched the search criteria
items = search.item_collection()
# Get item with the lowest cloud cover for each month
# Create an empty list to store the cloud cover value of each item in the collection
cloud_cover = []
for cld in items:
cloud_cover.append(eo.ext(cld).cloud_cover)
# Find the index location in the list of the asset with lowest cloud cover
min_cloud_cover = min(cloud_cover)
min_cloud_cover_idx = cloud_cover.index(min_cloud_cover)
# Get the Sentinel-2 asset with the lowest cloud cover
least_cloudy = items[min_cloud_cover_idx]
# Get the signed URL for the Red-edge least cloudy Sentinel-2 assets
least_cloudy_rededge8A_href = pc.sign(least_cloudy.assets["B8A"].href)
if i == 5:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol4_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window)
# Get affine transform for the windowed read
# this allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
# Create a copy of the metadata for saving later
out_meta_rededge4 = src.meta
# Get the shape of the rededge ndarray
# Force all other rasters to match this shape
out_shape_rededge8A4 = rededge8A.shape
else:
# Read rededge8A band
# Open a connection to the COG using its signed link
with rasterio.open(least_cloudy_rededge8A_href) as src:
aoi_bounds = features.bounds(pol4_env_shapely)
warped_aoi_bounds = warp.transform_bounds("epsg:4326", src.crs, *aoi_bounds)
aoi_window = windows.from_bounds(transform=src.transform, *warped_aoi_bounds)
rededge8A = src.read(1, window=aoi_window, out_shape=out_shape_rededge8A4)
# Get affine transform for the windowed read
# This allows us to reproject the windowed read of data from COG file
src_transform = src.transform
win_transform = src.window_transform(aoi_window)
rededge8A = rededge8A.astype("float32")
# Append rededge ndarray to list
rededge_arr4.append(rededge8A)
# Stack the Red-edge ndarrays to create a multiband raster
rededge_stacked4 = np.stack(rededge_arr4, axis=0)
print(f"the shape of rededge_arr is {rededge_stacked4.shape}")
# Inspect the metadata
print(out_meta_rededge4)
# Update meta for saving
out_meta_rededge4["dtype"] = "float32"
out_meta_rededge4["count"] = 6
out_meta_rededge4["height"] = rededge_arr4[0].shape[0]
out_meta_rededge4["width"] = rededge_arr4[0].shape[1]
out_meta_rededge4["transform"] = win_transform
print(out_meta_rededge4)
# Visualize the imported data as an animated time series
figRE4 = px.imshow(
rededge_stacked4,
animation_frame=0,
color_continuous_scale="IceFire",
range_color=[0, 5000],
height = 800,
width = 600)
figRE4.update_xaxes(showticklabels=False)
figRE4.update_yaxes(showticklabels=False)
figRE4.show()
processing month 5 processing month 6 processing month 7 processing month 8 processing month 9 processing month 10 the shape of rededge_arr is (6, 437, 189) {'driver': 'GTiff', 'dtype': 'uint16', 'nodata': None, 'width': 5490, 'height': 5490, 'count': 1, 'crs': CRS.from_epsg(32641), 'transform': Affine(20.0, 0.0, 699960.0, 0.0, -20.0, 4400040.0)} {'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 189, 'height': 437, 'count': 6, 'crs': CRS.from_epsg(32641), 'transform': Affine(20.0, 0.0, 696976.8515412887, 0.0, -20.0, 4298978.419482002)}
# Transform Pol1 NDVI and Red-edge into a tabular format
# Compute zonal statistics that summarize the NDVI values for Pol1
zstats_meta = out_meta1
zstats_meta["count"] = 1
# Create a copy of pol1_gdf to save zonal_stats results to
pol1_gdf_ndvi = pol1_gdf.copy()
# Create a For-loop that computes NDVI zonal statistics for each month
month_idx = 5
for i in ndvi_arr1:
print(f"computing ndvi zonal stats for month {month_idx}")
zstats = zonal_stats(
pol1_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol1_gdf_ndvi[f"ndvi_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Compute zonal statistics that summarize the Red-edge values for Pol1
zstats_meta = out_meta_rededge1
zstats_meta["count"] = 1
# Create a copy of pol1_gdf to save zonal_stats results to
pol1_gdf_rededge = pol1_gdf.copy()
# Create a For-loop that computes Red-edge zonal statistics for each month
month_idx = 5
for i in rededge_arr1:
print(f"computing rededge zonal stats for month {month_idx}")
zstats = zonal_stats(
pol1_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol1_gdf_rededge[f"rededge_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Combine the computed zonal statistics of NDVI and Rededge into one dataframe
pol1_NDVI_RE_df = pd.merge(
    left=pol1_gdf_ndvi, right=pol1_gdf_rededge, how="inner",
    on=["country", "region", "year", "season", "crop_1", "aoi", "crop1_code", "geometry"])
# Display the first few rows
pol1_NDVI_RE_df.head()
computing ndvi zonal stats for month 5 computing ndvi zonal stats for month 6 computing ndvi zonal stats for month 7
/opt/conda/lib/python3.10/site-packages/rasterstats/io.py:328: NodataWarning: Setting nodata to -999; specify nodata explicitly
computing ndvi zonal stats for month 8 computing ndvi zonal stats for month 9 computing ndvi zonal stats for month 10 computing rededge zonal stats for month 5 computing rededge zonal stats for month 6 computing rededge zonal stats for month 7 computing rededge zonal stats for month 8 computing rededge zonal stats for month 9 computing rededge zonal stats for month 10
country | region | year | season | crop_1 | aoi | crop1_code | geometry | ndvi_5 | ndvi_6 | ndvi_7 | ndvi_8 | ndvi_9 | ndvi_10 | rededge_5 | rededge_6 | rededge_7 | rededge_8 | rededge_9 | rededge_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Uzbekistan | Fergana | 2018 | summer | cotton | aoi_1 | 0 | MULTIPOLYGON (((71.18975 40.59049, 71.18977 40... | 0.106648 | 0.058165 | 0.487728 | 0.649186 | 0.379399 | 0.350379 | 2111.467836 | 2017.707602 | 3120.929825 | 3351.959064 | 2173.830409 | 2132.140351 |
1 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19309 40.58909, 71.19210 40... | 0.480385 | 0.464282 | 0.248883 | 0.517827 | 0.623533 | 0.554184 | 2596.467532 | 2192.253247 | 2360.298701 | 2547.110390 | 2883.032468 | 2709.240260 |
2 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19293 40.58684, 71.19218 40... | 0.562466 | 0.474759 | 0.096767 | 0.148958 | 0.376035 | 0.334232 | 2812.306122 | 2162.571429 | 2657.346939 | 2019.204082 | 2420.959184 | 2359.734694 |
3 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19390 40.58708, 71.19298 40... | 0.555527 | 0.467947 | 0.159104 | 0.327111 | 0.686653 | 0.565116 | 2813.687500 | 2137.265625 | 2037.906250 | 2397.437500 | 3153.304688 | 2953.085938 |
4 | Uzbekistan | Fergana | 2018 | summer | cotton | aoi_1 | 0 | MULTIPOLYGON (((71.17040 40.55147, 71.16873 40... | 0.113374 | 0.093075 | 0.718671 | 0.765948 | 0.417530 | 0.353729 | 2204.968421 | 2184.100000 | 4534.073684 | 4058.252632 | 2242.368421 | 2256.947368 |
# Transform Pol2 NDVI and Red-edge into a tabular format
# Compute zonal statistics that summarize the NDVI values for Pol2
zstats_meta = out_meta2
zstats_meta["count"] = 1
# Create a copy of pol2_gdf to save zonal_stats results to
pol2_gdf_ndvi = pol2_gdf.copy()
# Create a For-loop that computes NDVI zonal statistics for each month
month_idx = 5
for i in ndvi_arr2:
print(f"computing ndvi zonal stats for month {month_idx}")
zstats = zonal_stats(
pol2_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol2_gdf_ndvi[f"ndvi_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Compute zonal statistics that summarize the Red-edge values for Pol2
zstats_meta = out_meta_rededge2
zstats_meta["count"] = 1
# Create a copy of pol2_gdf to save zonal_stats results to
pol2_gdf_rededge = pol2_gdf.copy()
# Create a For-loop that computes Red-edge zonal statistics for each month
month_idx = 5
for i in rededge_arr2:
print(f"computing rededge zonal stats for month {month_idx}")
zstats = zonal_stats(
pol2_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol2_gdf_rededge[f"rededge_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Combine the computed zonal statistics of NDVI and Rededge into one dataframe
pol2_NDVI_RE_df = pd.merge(
    left=pol2_gdf_ndvi, right=pol2_gdf_rededge, how="inner",
    on=["country", "region", "year", "season", "crop_1", "aoi", "crop1_code", "geometry"])
# Display the first few rows
pol2_NDVI_RE_df.head()
computing ndvi zonal stats for month 5 computing ndvi zonal stats for month 6 computing ndvi zonal stats for month 7 computing ndvi zonal stats for month 8 computing ndvi zonal stats for month 9 computing ndvi zonal stats for month 10 computing rededge zonal stats for month 5 computing rededge zonal stats for month 6 computing rededge zonal stats for month 7 computing rededge zonal stats for month 8 computing rededge zonal stats for month 9 computing rededge zonal stats for month 10
country | region | year | season | crop_1 | aoi | crop1_code | geometry | ndvi_5 | ndvi_6 | ndvi_7 | ndvi_8 | ndvi_9 | ndvi_10 | rededge_5 | rededge_6 | rededge_7 | rededge_8 | rededge_9 | rededge_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Uzbekistan | Samarkand | 2016 | permanent | vineyard | aoi_2 | 6 | MULTIPOLYGON (((66.98073 39.10310, 66.98211 39... | 0.616800 | 0.616145 | 0.590545 | 0.648879 | 0.539297 | 0.474883 | 4109.843750 | 4027.875000 | 3997.031250 | 3880.135417 | 3364.979167 | 3095.583333 |
1 | Uzbekistan | Samarkand | 2016 | unclear | vegetables | aoi_2 | 7 | MULTIPOLYGON (((66.97909 39.10714, 66.97979 39... | 0.748144 | 0.541868 | 0.463725 | 0.557448 | 0.545439 | 0.434635 | 4669.937500 | 4002.375000 | 3571.812500 | 4083.312500 | 3447.812500 | 3212.187500 |
2 | Uzbekistan | Samarkand | 2016 | permanent | vineyard | aoi_2 | 6 | MULTIPOLYGON (((66.97806 39.10726, 66.97809 39... | 0.543793 | 0.591070 | 0.531800 | 0.591057 | 0.555157 | 0.434555 | 3846.788462 | 3834.692308 | 3762.076923 | 3600.009615 | 3303.230769 | 2993.182692 |
3 | Uzbekistan | Samarkand | 2016 | summer | maize | aoi_2 | 5 | MULTIPOLYGON (((66.98070 39.10780, 66.98098 39... | 0.791644 | 0.348566 | 0.283063 | 0.206600 | 0.195828 | 0.156752 | 3883.125000 | 3146.000000 | 3264.750000 | 3496.875000 | 2841.750000 | 2042.500000 |
4 | Uzbekistan | Samarkand | 2016 | winter | wheat | aoi_2 | 1 | MULTIPOLYGON (((66.97461 39.10863, 66.97308 39... | 0.675362 | 0.365412 | 0.420331 | 0.487360 | 0.336148 | 0.260112 | 4174.475000 | 3680.950000 | 3729.675000 | 3785.725000 | 3306.275000 | 3060.975000 |
# Transform Pol3 NDVI and Red-edge into a tabular format
# Compute zonal statistics that summarize the NDVI values for Pol3
zstats_meta = out_meta3
zstats_meta["count"] = 1
# Create a copy of pol3_gdf to save zonal_stats results to
pol3_gdf_ndvi = pol3_gdf.copy()
# Create a For-loop that computes NDVI zonal statistics for each month
month_idx = 5
for i in ndvi_arr3:
print(f"computing ndvi zonal stats for month {month_idx}")
zstats = zonal_stats(
pol3_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol3_gdf_ndvi[f"ndvi_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Compute zonal statistics that summarize the Red-edge values for Pol3
zstats_meta = out_meta_rededge3
zstats_meta["count"] = 1
# Create a copy of pol3_gdf to save zonal_stats results to
pol3_gdf_rededge = pol3_gdf.copy()
# Create a For-loop that computes Red-edge zonal statistics for each month
month_idx = 5
for i in rededge_arr3:
print(f"computing rededge zonal stats for month {month_idx}")
zstats = zonal_stats(
pol3_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol3_gdf_rededge[f"rededge_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Combine the computed zonal statistics of NDVI and Rededge into one dataframe
pol3_NDVI_RE_df = pd.merge(
    left=pol3_gdf_ndvi, right=pol3_gdf_rededge, how="inner",
    on=["country", "region", "year", "season", "crop_1", "aoi", "crop1_code", "geometry"])
# Display the first few rows
pol3_NDVI_RE_df.head()
computing ndvi zonal stats for month 5 computing ndvi zonal stats for month 6 computing ndvi zonal stats for month 7 computing ndvi zonal stats for month 8 computing ndvi zonal stats for month 9 computing ndvi zonal stats for month 10 computing rededge zonal stats for month 5 computing rededge zonal stats for month 6 computing rededge zonal stats for month 7 computing rededge zonal stats for month 8 computing rededge zonal stats for month 9 computing rededge zonal stats for month 10
country | region | year | season | crop_1 | aoi | crop1_code | geometry | ndvi_5 | ndvi_6 | ndvi_7 | ndvi_8 | ndvi_9 | ndvi_10 | rededge_5 | rededge_6 | rededge_7 | rededge_8 | rededge_9 | rededge_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.19614 39.19174, 65.19652 39.18742... | 0.651894 | 0.207635 | 0.195904 | 0.182068 | 0.199424 | 0.124098 | 3142.551669 | 4190.310016 | 4259.621622 | 4501.093800 | 4347.235294 | 2561.445151 |
1 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.19085 39.19327, 65.18710 39.19826... | 0.108015 | 0.138196 | 0.205911 | 0.483191 | 0.478064 | 0.228358 | 3197.067308 | 3102.408654 | 3142.394231 | 3549.528846 | 3387.072115 | 2796.418269 |
2 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.19075 39.18957, 65.19072 39.18759... | 0.186981 | 0.172990 | 0.211738 | 0.490073 | 0.518232 | 0.251647 | 3034.577626 | 3196.744292 | 3603.045662 | 3826.844749 | 3703.933790 | 3273.424658 |
3 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.18067 39.18739, 65.18743 39.18734... | 0.174689 | 0.209438 | 0.244037 | 0.476990 | 0.510088 | 0.309258 | 3365.637555 | 3293.724891 | 3374.043668 | 3589.860262 | 3298.663755 | 2919.606987 |
4 | Uzbekistan | Samarkand | 2016 | summer | cotton | aoi_3 | 0 | POLYGON ((65.18304 39.18958, 65.17674 39.18661... | 0.483318 | 0.245942 | 0.170634 | 0.122645 | 0.135018 | 0.094399 | 3451.836483 | 4281.904684 | 4405.020542 | 4188.260477 | 3437.099425 | 2390.975349 |
# Transform Pol4 NDVI and Red-edge into a tabular format
# Compute zonal statistics that summarize the NDVI values for Pol4
zstats_meta = out_meta4
zstats_meta["count"] = 1
# Create a copy of pol4_gdf to save zonal_stats results to
pol4_gdf_ndvi = pol4_gdf.copy()
# Create a For-loop that computes NDVI zonal statistics for each month
month_idx = 5
for i in ndvi_arr4:
print(f"computing ndvi zonal stats for month {month_idx}")
zstats = zonal_stats(
pol4_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol4_gdf_ndvi[f"ndvi_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Compute zonal statistics that summarize the Red-edge values for Pol4
zstats_meta = out_meta_rededge4
zstats_meta["count"] = 1
# Create a copy of pol4_gdf to save zonal_stats results to
pol4_gdf_rededge = pol4_gdf.copy()
# Create a For-loop that computes Red-edge zonal statistics for each month
month_idx = 5
for i in rededge_arr4:
print(f"computing rededge zonal stats for month {month_idx}")
zstats = zonal_stats(
pol4_gdf.to_crs(zstats_meta["crs"]),
i,
affine=zstats_meta["transform"],
stats=["mean"],
all_touched=True
)
df_zstats = pd.DataFrame(zstats)
pol4_gdf_rededge[f"rededge_{month_idx}"] = df_zstats.iloc[:, 0]
month_idx += 1
# Combine the computed zonal statistics of NDVI and Rededge into one dataframe
pol4_NDVI_RE_df = pd.merge(
    left=pol4_gdf_ndvi, right=pol4_gdf_rededge, how="inner",
    on=["country", "region", "year", "season", "crop_1", "aoi", "crop1_code", "geometry"])
# Display the first few rows
pol4_NDVI_RE_df.head()
computing ndvi zonal stats for month 5 computing ndvi zonal stats for month 6 computing ndvi zonal stats for month 7 computing ndvi zonal stats for month 8 computing ndvi zonal stats for month 9 computing ndvi zonal stats for month 10 computing rededge zonal stats for month 5 computing rededge zonal stats for month 6 computing rededge zonal stats for month 7 computing rededge zonal stats for month 8 computing rededge zonal stats for month 9 computing rededge zonal stats for month 10
country | region | year | season | crop_1 | aoi | crop1_code | geometry | ndvi_5 | ndvi_6 | ndvi_7 | ndvi_8 | ndvi_9 | ndvi_10 | rededge_5 | rededge_6 | rededge_7 | rededge_8 | rededge_9 | rededge_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Uzbekistan | Kashkadarya | 2018 | winter | wheat | aoi_4 | 1 | POLYGON ((65.30076 38.79538, 65.30125 38.79608... | 0.110825 | 0.121411 | 0.299068 | 0.585074 | 0.741629 | 0.549816 | 2900.876404 | 2901.741573 | 3543.550562 | 4159.280899 | 4146.483146 | 2850.898876 |
1 | Uzbekistan | Kashkadarya | 2018 | summer | cotton | aoi_4 | 0 | POLYGON ((65.29607 38.79463, 65.29717 38.79513... | 0.521030 | 0.345005 | 0.253342 | 0.240403 | 0.235322 | 0.190898 | 2970.420455 | 3903.886364 | 3709.397727 | 3917.454545 | 3781.590909 | 3302.147727 |
2 | Uzbekistan | Kashkadarya | 2018 | winter | wheat | aoi_4 | 1 | POLYGON ((65.28487 38.80078, 65.29384 38.80442... | 0.301086 | 0.130437 | 0.239223 | 0.386766 | 0.454388 | 0.309634 | 3078.240000 | 3209.526667 | 3597.360000 | 3783.020000 | 3730.333333 | 2761.180000 |
3 | Uzbekistan | Kashkadarya | 2018 | permanent | orchard | aoi_4 | 2 | POLYGON ((65.29744 38.79249, 65.29819 38.79281... | 0.339116 | 0.255328 | 0.133188 | 0.177755 | 0.448556 | 0.323423 | 3031.396552 | 3594.017241 | 2920.448276 | 3531.224138 | 3543.741379 | 2932.327586 |
4 | Uzbekistan | Kashkadarya | 2018 | permanent | orchard | aoi_4 | 2 | POLYGON ((65.29721 38.79514, 65.29837 38.79563... | 0.341406 | 0.263521 | 0.075036 | 0.142118 | 0.516263 | 0.369137 | 3088.340426 | 3635.436170 | 2219.872340 | 3255.202128 | 3427.191489 | 2755.648936 |
# Combine NDVI and Red-edge data from all polygons into one dataframe
# Create a list storing the dataframe for each polygon
gdf_list = [pol1_NDVI_RE_df, pol2_NDVI_RE_df,pol3_NDVI_RE_df,pol4_NDVI_RE_df]
# Concatenate each dataframe on to each other and store in AllPol dataframe
AllPol_gdf = pd.concat(gdf_list)
# Check the type of AllPol_gdf
print(type(AllPol_gdf))
# Check the shape of AllPol_gdf
print(AllPol_gdf.shape)
# Print the first few columns of AllPol_gdf
AllPol_gdf.head()
<class 'geopandas.geodataframe.GeoDataFrame'> (445, 20)
country | region | year | season | crop_1 | aoi | crop1_code | geometry | ndvi_5 | ndvi_6 | ndvi_7 | ndvi_8 | ndvi_9 | ndvi_10 | rededge_5 | rededge_6 | rededge_7 | rededge_8 | rededge_9 | rededge_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Uzbekistan | Fergana | 2018 | summer | cotton | aoi_1 | 0 | MULTIPOLYGON (((71.18975 40.59049, 71.18977 40... | 0.106648 | 0.058165 | 0.487728 | 0.649186 | 0.379399 | 0.350379 | 2111.467836 | 2017.707602 | 3120.929825 | 3351.959064 | 2173.830409 | 2132.140351 |
1 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19309 40.58909, 71.19210 40... | 0.480385 | 0.464282 | 0.248883 | 0.517827 | 0.623533 | 0.554184 | 2596.467532 | 2192.253247 | 2360.298701 | 2547.110390 | 2883.032468 | 2709.240260 |
2 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19293 40.58684, 71.19218 40... | 0.562466 | 0.474759 | 0.096767 | 0.148958 | 0.376035 | 0.334232 | 2812.306122 | 2162.571429 | 2657.346939 | 2019.204082 | 2420.959184 | 2359.734694 |
3 | Uzbekistan | Fergana | 2018 | double | wheat | aoi_1 | 1 | MULTIPOLYGON (((71.19390 40.58708, 71.19298 40... | 0.555527 | 0.467947 | 0.159104 | 0.327111 | 0.686653 | 0.565116 | 2813.687500 | 2137.265625 | 2037.906250 | 2397.437500 | 3153.304688 | 2953.085938 |
4 | Uzbekistan | Fergana | 2018 | summer | cotton | aoi_1 | 0 | MULTIPOLYGON (((71.17040 40.55147, 71.16873 40... | 0.113374 | 0.093075 | 0.718671 | 0.765948 | 0.417530 | 0.353729 | 2204.968421 | 2184.100000 | 4534.073684 | 4058.252632 | 2242.368421 | 2256.947368 |
# Check how many rows with NaN values we have
nan_counts = AllPol_gdf.isnull().sum()
print(nan_counts)
# We have 33 rows with NaN
country 0 region 0 year 0 season 0 crop_1 0 aoi 0 crop1_code 0 geometry 0 ndvi_5 33 ndvi_6 33 ndvi_7 33 ndvi_8 33 ndvi_9 33 ndvi_10 33 rededge_5 0 rededge_6 33 rededge_7 33 rededge_8 33 rededge_9 33 rededge_10 33 dtype: int64
# Remove all rows with NaN
print("Shape of DataFrame before dropping zero yield values:")
print(AllPol_gdf.shape)
AllPol_gdf_dropped_NaN = AllPol_gdf.dropna()
print("Shape of DataFrame after dropping zero yield values:")
print(AllPol_gdf_dropped_NaN.shape)
# Number of rows decreased by 33, confirming we have removed all the rows with NaN values
Shape of DataFrame before dropping rows with NaN values: (445, 20) Shape of DataFrame after dropping rows with NaN values: (412, 20)
# Check how many rows with outliers we have
# Create a list of predictors to pass into the For-loop
predictors = ["ndvi_5","ndvi_6","ndvi_7","ndvi_8","ndvi_9","ndvi_10","rededge_5","rededge_6","rededge_7","rededge_8","rededge_9","rededge_10"]
# Generate a For-loop that counts the number of rows that have outliers for each predictor
for i in predictors:
print(f"The number of rows with outliers in {i}")
summary = AllPol_gdf_dropped_NaN[[i]].describe()
mean = summary.loc['mean']
std = summary.loc['std']
threshold = 3 * std
# Define outliers
outliers = (AllPol_gdf_dropped_NaN[[i]] > (mean + threshold)) | (AllPol_gdf_dropped_NaN[[i]] < (mean - threshold))
# Compute the total number of rows with outliers
rows_with_outliers = outliers.any(axis=1).sum()
print(rows_with_outliers)
The number of rows with outliers in ndvi_5 0 The number of rows with outliers in ndvi_6 2 The number of rows with outliers in ndvi_7 1 The number of rows with outliers in ndvi_8 0 The number of rows with outliers in ndvi_9 0 The number of rows with outliers in ndvi_10 1 The number of rows with outliers in rededge_5 12 The number of rows with outliers in rededge_6 0 The number of rows with outliers in rededge_7 2 The number of rows with outliers in rededge_8 2 The number of rows with outliers in rededge_9 1 The number of rows with outliers in rededge_10 2
# Remove outliers column by column for each predictor
# Create a list of predictors to pass into the For-loop
predictors = ["ndvi_5","ndvi_6","ndvi_7","ndvi_8","ndvi_9","ndvi_10","rededge_5","rededge_6","rededge_7","rededge_8","rededge_9","rededge_10"]
# Calculate the mean of each column
for i in predictors:
print(AllPol_gdf_dropped_NaN[i].mean())
# Store mean values in a list
mean_columns = [0.37863035405212636,
0.29362544445278704,
0.30908943074915174,
0.3761947765611446,
0.39587637956960864,
0.31138927726383514,
4687.018390525821,
3417.233096856336,
3552.2705976999932,
3578.5544779844145,
3407.8627182745977,
2903.206591342297]
# Calculate the standard deviation of each column
for i in predictors:
print(AllPol_gdf_dropped_NaN[i].std())
#Store standard deviation values in a list
sd_columns = [0.20280309247659548,
0.1680853065268984,
0.1559785137138195,
0.17190269643184122,
0.16819120363708812,
0.1533305540472219,
8063.2830106455,
678.5587102694418,
548.1344136832438,
502.9737064976784,
507.99505789788094,
420.736125187469]
0.37863035405212636 0.29362544445278704 0.30908943074915174 0.3761947765611446 0.39587637956960864 0.31138927726383514 4687.018390525821 3417.233096856336 3552.2705976999932 3578.5544779844145 3407.8627182745977 2903.206591342297 0.2028030924765954 0.16808530652689838 0.15597851371381952 0.1719026964318412 0.168191203637088 0.15333055404722185 8063.283010645502 678.5587102694416 548.1344136832438 502.97370649767817 507.99505789788066 420.73612518746904
# Set index to 0 to track the position in the mean and standard deviation lists throughout the loop
idx = 0
def remove_outliers(df, k, threshold=3):
    # Keep only rows whose value in column k lies within `threshold` standard deviations of the column mean
    mean = mean_columns[idx]
    std = sd_columns[idx]
    lower_bound = mean - (threshold * std)
    upper_bound = mean + (threshold * std)
    return df[(df[k] >= lower_bound) & (df[k] <= upper_bound)]
filtered_gdf = AllPol_gdf_dropped_NaN
for k in predictors:
# Generate a For-loop that removes all the rows with outliers in each predictor column
filtered_gdf = remove_outliers(filtered_gdf, k)
filtered_gdf.reset_index(drop=True, inplace=True)
print(filtered_gdf.shape)
# Increment index by 1 to move to the next mean and standard deviation values in the lists
idx = idx + 1
(412, 20) (410, 20) (409, 20) (409, 20) (409, 20) (408, 20) (396, 20) (396, 20) (395, 20) (394, 20) (393, 20) (393, 20)
We counted the total number of rows with NaN values using a combination of isnull() and sum(), so that we could check whether the correct number of rows was removed. Rows with NaN values were then removed using the dropna() function.
In this model, outliers are defined as any value outside the range of three standard deviations from the mean. We counted the rows with outliers for each column so that we could check whether the expected number of rows was removed. However, one row can have outliers in multiple columns; as a row is removed, the next column may have one fewer row with outliers left to remove. This means the counts we use as a check may not exactly match the number of rows actually removed for each column, but they do give an approximate expectation. For example, the red-edge 5 column has 12 outliers, and the number of rows dropped from 408 to 396 when its outliers were removed, indicating that we successfully removed them.
Outliers were removed column by column using the manually defined remove_outliers() function, which drops any value below the lower bound (3 standard deviations below the mean) or above the upper bound (3 standard deviations above the mean).
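The same three-standard-deviation rule can also be written without the hard-coded lists by computing the column statistics up front with pandas. The following is a minimal sketch of that equivalent filter (assuming the AllPol_gdf_dropped_NaN dataframe and the predictors list defined above); it is an illustration, not the code used for the results reported here.
# Sketch: equivalent column-by-column outlier filter with the statistics computed up front
stats = AllPol_gdf_dropped_NaN[predictors].agg(["mean", "std"])
filtered_check = AllPol_gdf_dropped_NaN.copy()
for col in predictors:
    lower = stats.loc["mean", col] - 3 * stats.loc["std", col]
    upper = stats.loc["mean", col] + 3 * stats.loc["std", col]
    # Keep only rows within 3 standard deviations of the column mean
    filtered_check = filtered_check[(filtered_check[col] >= lower) & (filtered_check[col] <= upper)]
print(filtered_check.shape)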
# Create a bar chart to count the number of observations we have for each crop type
Crop_count = px.histogram(filtered_gdf, x="crop_1")
Crop_count.update_yaxes(title_text="Number of observations")
Crop_count.update_xaxes(title_text="Crop type")
Crop_count.show()
print(filtered_gdf.shape)
# From the bar chart, we found that the number of observations for rice, beans, maize, vegetables and fallow is insufficient to train a proper classification model; therefore, we drop these crop types
condition_rice = filtered_gdf["crop_1"] == "rice"
condition_beans = filtered_gdf["crop_1"] == "beans"
condition_fallow= filtered_gdf["crop_1"] == "fallow"
condition_maize= filtered_gdf["crop_1"] == "maize"
condition_vegetables= filtered_gdf["crop_1"] == "vegetables"
filtered_gdf = filtered_gdf.drop(filtered_gdf[condition_rice].index)
filtered_gdf = filtered_gdf.drop(filtered_gdf[condition_beans].index)
filtered_gdf = filtered_gdf.drop(filtered_gdf[condition_fallow].index)
filtered_gdf = filtered_gdf.drop(filtered_gdf[condition_maize].index)
filtered_gdf = filtered_gdf.drop(filtered_gdf[condition_vegetables].index)
# Print the shape to check that 24 rows representing rice, beans, maize, vegetables, and fallow were dropped
print(filtered_gdf.shape)
(393, 20) (369, 20)
/opt/conda/lib/python3.10/site-packages/geopandas/geodataframe.py:1428: UserWarning: Boolean Series key will be reindexed to match DataFrame index. /opt/conda/lib/python3.10/site-packages/geopandas/geodataframe.py:1428: UserWarning: Boolean Series key will be reindexed to match DataFrame index. /opt/conda/lib/python3.10/site-packages/geopandas/geodataframe.py:1428: UserWarning: Boolean Series key will be reindexed to match DataFrame index. /opt/conda/lib/python3.10/site-packages/geopandas/geodataframe.py:1428: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
# Create a new dataframe that summarizes the average values of NDVI and Red-edge for each crop type
filtered_gdf_groups = filtered_gdf.groupby(["crop_1"])
filtered_gdf_groups.size()
# Pass numeric_only=True so only the numeric NDVI/Red-edge columns are averaged
crop_averages = filtered_gdf_groups.mean(numeric_only=True)
crop_averages
crop1_code | ndvi_5 | ndvi_6 | ndvi_7 | ndvi_8 | ndvi_9 | ndvi_10 | rededge_5 | rededge_6 | rededge_7 | rededge_8 | rededge_9 | rededge_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
crop_1 | |||||||||||||
alfalfa | 8.0 | 0.569191 | 0.421874 | 0.403822 | 0.399101 | 0.435692 | 0.379882 | 3801.269029 | 3799.404979 | 3856.133407 | 3760.713435 | 3645.027031 | 3265.225652 |
cotton | 0.0 | 0.259064 | 0.189474 | 0.299315 | 0.382988 | 0.356697 | 0.264313 | 3132.246634 | 3267.360390 | 3600.293509 | 3592.284702 | 3319.144499 | 2790.055704 |
orchard | 2.0 | 0.510183 | 0.411830 | 0.414758 | 0.469672 | 0.524347 | 0.416552 | 3620.155814 | 3626.364470 | 3550.457153 | 3658.882003 | 3564.856285 | 2988.295056 |
vineyard | 6.0 | 0.527496 | 0.495214 | 0.470471 | 0.488136 | 0.468839 | 0.421426 | 3862.713535 | 3828.018298 | 3792.799446 | 3699.929657 | 3425.881815 | 3063.786412 |
wheat | 1.0 | 0.381277 | 0.297124 | 0.250301 | 0.334811 | 0.402461 | 0.308856 | 3339.121626 | 3370.310844 | 3406.661813 | 3490.657762 | 3439.219007 | 2903.148407 |
# Transpose the dataframe to enable visualization in line graphs
crop_averages_Transposed= crop_averages.T
crop_averages_T_NDVI = crop_averages_Transposed.copy()
# Drop the red-edge rows to visualize NDVI data
crop_averages_NDVI_ready = crop_averages_T_NDVI.drop(["crop1_code","rededge_5","rededge_6","rededge_7","rededge_8","rededge_9","rededge_10"],axis=0)
crop_averages_NDVI_ready
crop_1 | alfalfa | cotton | orchard | vineyard | wheat |
---|---|---|---|---|---|
ndvi_5 | 0.569191 | 0.259064 | 0.510183 | 0.527496 | 0.381277 |
ndvi_6 | 0.421874 | 0.189474 | 0.411830 | 0.495214 | 0.297124 |
ndvi_7 | 0.403822 | 0.299315 | 0.414758 | 0.470471 | 0.250301 |
ndvi_8 | 0.399101 | 0.382988 | 0.469672 | 0.488136 | 0.334811 |
ndvi_9 | 0.435692 | 0.356697 | 0.524347 | 0.468839 | 0.402461 |
ndvi_10 | 0.379882 | 0.264313 | 0.416552 | 0.421426 | 0.308856 |
# Visualize the average NDVI data over time for each crop
# Create a list of crop names to pass into the For-loop
crop = ["alfalfa","cotton", "orchard","vineyard", "wheat"]
# Generate a For-loop that creates a line graph for each crop type displaying average NDVI data over time
for crop_name in crop:
    fig_ndvi = px.line(
        data_frame=crop_averages_NDVI_ready,
        x=["May", "June", "July", "August", "September", "October"],
        y=crop_name,
        title=f"NDVI of <b>{crop_name}</b> crop from May to October, 2018 in Central Asia")
    fig_ndvi.update_traces(line_color='#009b00', line_width=5)
    fig_ndvi.update_yaxes(title_text="NDVI")
    fig_ndvi.update_xaxes(title_text="Month")
    fig_ndvi.show()
# Transpose the dataframe to allow visualization in line graphs
crop_averages_Transposed= crop_averages.T
crop_averages_T_Rededge = crop_averages_Transposed.copy()
# Drop the NDVI rows to visualize Red-edge data
crop_averages_Rededge_ready = crop_averages_T_Rededge.drop(["crop1_code","ndvi_5","ndvi_6","ndvi_7","ndvi_8","ndvi_9","ndvi_10"],axis=0)
crop_averages_Rededge_ready
crop_1 | alfalfa | cotton | orchard | vineyard | wheat |
---|---|---|---|---|---|
rededge_5 | 3801.269029 | 3132.246634 | 3620.155814 | 3862.713535 | 3339.121626 |
rededge_6 | 3799.404979 | 3267.360390 | 3626.364470 | 3828.018298 | 3370.310844 |
rededge_7 | 3856.133407 | 3600.293509 | 3550.457153 | 3792.799446 | 3406.661813 |
rededge_8 | 3760.713435 | 3592.284702 | 3658.882003 | 3699.929657 | 3490.657762 |
rededge_9 | 3645.027031 | 3319.144499 | 3564.856285 | 3425.881815 | 3439.219007 |
rededge_10 | 3265.225652 | 2790.055704 | 2988.295056 | 3063.786412 | 2903.148407 |
# Visualize the average Red-edge data over time for each crop
# Create a list of crop names to pass into the For-loop
crop = ["alfalfa","cotton", "orchard","vineyard", "wheat"]
# Generate a For-loop that creates a line graph for each crop type displaying average Red-edge data over time
for crop_name in crop:
    fig_rededge = px.line(
        data_frame=crop_averages_Rededge_ready,
        x=["May", "June", "July", "August", "September", "October"],
        y=crop_name,
        title=f"Rededge of <b>{crop_name}</b> crop from May to October, 2018 in Central Asia")
    fig_rededge.update_traces(line_color='#FF0000', line_width=5)
    fig_rededge.update_yaxes(title_text="Rededge")
    fig_rededge.update_xaxes(title_text="Month")
    fig_rededge.show()
# Pool vineyard, orchard and alfalfa together by changing their crop type label to "others"
filtered_gdf["crop_1"] = filtered_gdf["crop_1"].replace("vineyard", 'others')
filtered_gdf["crop_1"] = filtered_gdf["crop_1"].replace("orchard", 'others')
filtered_gdf["crop_1"] = filtered_gdf["crop_1"].replace("alfalfa", 'others')
# check the new crop types. There should be 3 crop types: wheat, cotton, others
filtered_gdf["crop_1"].value_counts()
wheat 165 cotton 134 others 70 Name: crop_1, dtype: int64
We created a bar chart to illustrate the number of observations for each crop type. This allowed us to identify and remove the crop types (rice, fallow, maize, vegetables and beans) that have too few observations for the model to learn their patterns.
We visualized the NDVI and red-edge data for each crop over the chosen 6-month period using line graphs, which are well suited to continuous time-series data. Because the machine-learning model relies on the NDVI/red-edge profiles differing between crop types, it is useful to know how similar or distinct each crop type is. For example, if the model mistakes wheat for cotton, the graphs may show that the NDVI/red-edge profiles of the two crops are similar, explaining the confusion. The NDVI and red-edge data are displayed in green and red, respectively, to distinguish the two sets of graphs, and the crop name is in bold so viewers can quickly identify the crop each graph represents.
As a rough rule of thumb, around 50 observations per class gives a machine-learning model a reasonable chance of learning the structure of that class. We therefore grouped vineyard, orchard and alfalfa together as "others", which adds up to 70 samples. The line graphs show that all three crops have similar NDVI and red-edge values, so pooling them is unlikely to mix conflicting patterns. Pooling the minority classes also alleviates the class-imbalance issue.
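Pooling reduces but does not remove the imbalance between wheat, cotton and "others". One complementary option, shown below purely as a sketch (it is not used in the model trained later), is to let the random forest reweight classes inversely to their frequency through the class_weight parameter.
# Sketch only: a class-weighted random forest as one way to counter residual class imbalance
rf_weighted = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",  # weight classes inversely to their frequency in the training data
    random_state=6)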
# Explore our NDVI and Red-edge data for each field polygon
filtered_gdf.explore("crop_1", tiles="CartoDB dark_matter", cmap="tab20", categorical=True)
# Check that there are enough observations in each polygon (area of interest)
filtered_gdf.groupby("aoi").count().loc[:,"crop1_code"]
aoi aoi_1 68 aoi_2 83 aoi_3 119 aoi_4 99 Name: crop1_code, dtype: int64
# Subset the geodataframe to create our x object, which should follow the format (samples, predictors)
x_sp = filtered_gdf.drop(["country", "region", "year", "season", "crop_1", "aoi", "crop1_code","geometry"], axis=1)
# Subset the geodataframe to create our y object, which should have only the ground-truth outcomes
y_sp = filtered_gdf.loc[:, "crop_1"]
# check the structure and shape of x
print(x_sp.head())
print(x_sp.shape)
# check the structure and shape of y
print(y_sp.head())
print(y_sp.shape)
ndvi_5 ndvi_6 ndvi_7 ndvi_8 ndvi_9 ndvi_10 rededge_5 \ 0 0.106648 0.058165 0.487728 0.649186 0.379399 0.350379 2111.467836 1 0.480385 0.464282 0.248883 0.517827 0.623533 0.554184 2596.467532 2 0.555527 0.467947 0.159104 0.327111 0.686653 0.565116 2813.687500 3 0.113374 0.093075 0.718671 0.765948 0.417530 0.353729 2204.968421 4 0.142455 0.103555 0.623383 0.732566 0.401693 0.330844 2286.989362 rededge_6 rededge_7 rededge_8 rededge_9 rededge_10 0 2017.707602 3120.929825 3351.959064 2173.830409 2132.140351 1 2192.253247 2360.298701 2547.110390 2883.032468 2709.240260 2 2137.265625 2037.906250 2397.437500 3153.304688 2953.085938 3 2184.100000 4534.073684 4058.252632 2242.368421 2256.947368 4 2224.404255 4218.494681 3734.882979 2268.154255 2295.696809 (369, 12) 0 cotton 1 wheat 2 wheat 3 cotton 4 cotton Name: crop_1, dtype: object (369,)
# To prevent data leakage, we ensure that the training and test sets do not contain samples from the same area.
groups = filtered_gdf.loc[:, "aoi"]
# Use GroupShuffleSplit function to obtain 4 array-like objects: x_train_sp, x_test_sp, y_train_sp, y_test_sp
gss = GroupShuffleSplit(n_splits=1, test_size=.5, random_state=6)
for i, (train_index, test_index) in enumerate(gss.split(x_sp, y_sp, groups)):
print(f"processing split {i}")
x_train_sp = x_sp.iloc[train_index, :]
x_test_sp = x_sp.iloc[test_index, :]
y_train_sp = y_sp.iloc[train_index]
y_test_sp = y_sp.iloc[test_index]
# Inspect the shape of each object
print(f"the size of the training features object is {x_train_sp.shape}")
print(f"the size of the test features object is {x_test_sp.shape}")
print(f"the size of the training outcomes object is {y_train_sp.shape}")
print(f"the size of the test outcomes object is {y_test_sp.shape}")
processing split 0 the size of the training features object is (202, 12) the size of the test features object is (167, 12) the size of the training outcomes object is (202,) the size of the test outcomes object is (167,)
# Check that both the training and test sets have all three types of crops
print("The training set has the following crop types:")
print(y_train_sp.value_counts())
print("")
print("The test set has the following crop types:")
print(y_test_sp.value_counts())
The training set has the following crop types: wheat 78 cotton 65 others 59 Name: crop_1, dtype: int64 The test set has the following crop types: wheat 87 cotton 69 others 11 Name: crop_1, dtype: int64
# Create and train a random forests model
rf_sp = RandomForestClassifier(n_estimators=200, random_state=6)
rf_sp.fit(x_train_sp, y_train_sp.astype(str))
RandomForestClassifier(n_estimators=200, random_state=6)
# Using the random forest model that we created using the training set to predict the x_test_sp object
y_pred_sp = rf_sp.predict(x_test_sp)
# Test and evaluate the model using a classification report, which compares the predictions made for x_test_sp with the ground-truth labels in y_test_sp
print(classification_report(y_test_sp, y_pred_sp))
precision recall f1-score support cotton 0.44 0.23 0.30 69 others 0.05 0.09 0.06 11 wheat 0.49 0.61 0.54 87 accuracy 0.42 167 macro avg 0.33 0.31 0.30 167 weighted avg 0.44 0.42 0.41 167
# Evaluate the accuracy for specific crop types using a confusion matrix
# Create a list of crop types
labels = ["cotton","others","wheat"]
# Create a confusion matrix
cm = confusion_matrix(y_test_sp, y_pred_sp)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(text_kw={"fontsize":10}, xticks_rotation="vertical")
plt.show()
# Compute permutation importance to determine the predictive ability of each feature
p_imp = permutation_importance(rf_sp, x_sp, y_sp, scoring="accuracy", n_repeats=30, random_state=6)
# Visualize the permutation importance of each predictor in a barplot
columns = x_sp.columns
p_imp = abs(p_imp["importances_mean"])
p_imp_df = pd.DataFrame({"feature": columns, "importance": p_imp})
p_imp_df = p_imp_df.sort_values(by=["importance"], ascending=True)
fig = px.bar(p_imp_df, y="feature", x="importance", height=600)
fig.show()
# Create a map comparing between predictions made by the model and ground-truth data
all_preds_sp = rf_sp.predict(x_sp)
Prediction_map = filtered_gdf.copy()
Prediction_map["predicted"] = all_preds_sp.astype("str")
# The predicted values are already crop-name strings, so no further recoding is needed
basemap = "https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}"
attribution = "Tiles © Esri — Source: Esri, i-cubed, USDA, USGS, AEX, GeoEye, Getmapping, Aerogrid, IGN, IGP, UPR-EGP, and the GIS User Community"
Prediction_map.explore(column="predicted", cmap=["yellow","blue","red"], categorical=True, tiles=basemap, attr=attribution, tooltip=["crop_1", "predicted"])
The metrics used to assess model performance and why they are suitable for this sub-task.
We used the classification_report function to generate metrics, including precision, recall, f1-score and accuracy. Precision measures the proportion of correct positive predictions out of all positive predictions made by the model. Recall measures the proportion of true positive predictions out of all actual positive instances including false negatives. Both metrics help assess the model's ability to accurately predict positive instances (precision) and to avoid missing true positives (recall).
The f1-score combines precision and recall, making it a balanced assessment of the model's prediction as it acknowledges the consequences of both false positives and false negatives. The accuracy score, which is the proportion of correct predictions out of all predictions, provides an overall evaluation of the model, but may not accurately reflect how well the model predicts the minority classes. This is because the predictive performance for the majority classes tends to overshadow any effect of the predictions on the minority classes. Hence, F1-score is useful for such cases as it is computed for each class, revealing class-specific information that is not shown by the accuracy score.
We also created a confusion matrix, which provides a helpful visualization of the above metrics. It allows us to better understand class-specific performance by displaying the true positive, true negative, false positive and false negative predictions for each class. It also helps us detect class-imbalance problems and identify consistent patterns of errors. This information is valuable because it suggests how the model could be improved, for example by applying different sampling techniques.
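As a quick illustration of how these per-class metrics follow from the confusion matrix entries, the sketch below recomputes precision, recall and F1 for each class from the cm and labels objects created above (rows of cm are true classes, columns are predicted classes); it should reproduce the values in the classification report.
# Sketch: recover per-class precision, recall and F1 from the confusion matrix
for k, crop_name in enumerate(labels):
    tp = cm[k, k]                # correctly predicted as this class
    fp = cm[:, k].sum() - tp     # predicted as this class but actually another
    fn = cm[k, :].sum() - tp     # actually this class but predicted as another
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    print(f"{crop_name}: precision={precision:.2f}, recall={recall:.2f}, f1={f1:.2f}")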
The strategy used to evaluate model performance and why it is suitable for this sub-task.
To evaluate the model, we created a training set and a test set from the NDVI/red-edge data, so that the model could be trained on the training set and evaluated on the test set. We used the split() function of scikit-learn's GroupShuffleSplit object to split the data such that samples from the same area (polygon) never appear in both the training and the test set. This prevents data leakage between the two sets and gives a more realistic estimate of performance on areas the model has not seen. We also split the data in a 1:1 ratio so that each set contains two areas of interest, making it more likely that both sets include all three crop classes. Having all classes in both sets allowed us to gain insight into the model's predictive ability for every crop class.
When training the Random Forest classifier, we kept the random state consistent throughout, so that the evaluation metrics can be reproduced if the random splitting of the data is executed again. Finally, we computed the permutation importance to determine the predictive ability of each predictor. This shows how the predictors compare with one another and whether any insignificant or redundant predictors, which would only add noise, should be removed.
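A natural extension of this strategy is group-aware cross-validation, where each fold holds out one area of interest so that every area is used as unseen test data exactly once. The sketch below assumes the x_sp, y_sp and groups objects defined above; it is an additional check that was not run for the results reported here.
# Sketch: group-aware cross-validation with one area of interest held out per fold
from sklearn.model_selection import GroupKFold
gkf = GroupKFold(n_splits=4)  # four folds, one per aoi
cv_scores = cross_val_score(
    RandomForestClassifier(n_estimators=200, random_state=6),
    x_sp, y_sp, groups=groups, cv=gkf, scoring="accuracy")
print(cv_scores)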
The model’s suitability as a tool for generating crop type maps and suggestions for improving its predictive performance and adapting the model for deployment in new cropping landscapes.
Our model achieved an accuracy score of 42%, which falls below the desired threshold of 70% for a realistic and valuable predictive output. Although the model performs comparatively well for wheat (the highest f1-score, 0.54), it is not suitable for predicting the other crop types. Grouping vineyard, orchard and alfalfa into one class was attempted to reduce undersampling issues, but the f1-score for the "others" class remained too low for the model to be useful in predicting these crops in new cropping landscapes.
A study by Schulthess et al. found that optimal results require at least 450 training samples for dominant classes and 40 training samples for minority classes. To improve accuracy, we should therefore aim for at least 450 training samples for cotton and wheat and around 50 training samples for the minority crop classes. Furthermore, we can remove redundant and irrelevant features identified by the permutation importance to improve accuracy, reduce processing time and prevent over-fitting. Features with low permutation importance, such as ndvi_8, rededge_9 and rededge_8, can be eliminated, followed by re-evaluation to compare the new accuracy score with the original (see the sketch below).
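A minimal sketch of that re-evaluation step is given below; it assumes the training and test objects created earlier, and the list of low-importance features is provisional rather than a definitive selection.
# Sketch: drop provisionally identified low-importance features and re-evaluate the model
low_importance = ["ndvi_8", "rededge_9", "rededge_8"]
x_train_reduced = x_train_sp.drop(columns=low_importance)
x_test_reduced = x_test_sp.drop(columns=low_importance)
rf_reduced = RandomForestClassifier(n_estimators=200, random_state=6)
rf_reduced.fit(x_train_reduced, y_train_sp.astype(str))
print(classification_report(y_test_sp, rf_reduced.predict(x_test_reduced)))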
The confusion matrix reveals considerable confusion between cotton and wheat, possibly because the predictors for these crops are too similar. To address this, incorporating features that show more variation between the two crop types, such as MSAVI, could be beneficial. MSAVI (the Modified Soil-Adjusted Vegetation Index) is designed to reduce the influence of bare soil, making it useful where NDVI and red-edge become unreliable because of sparse vegetation cover or low chlorophyll content (a brief sketch follows below).
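To illustrate what adding such a feature would involve, the sketch below computes MSAVI2 from the same red and nir window reads used for NDVI. The division by 10000 to convert Sentinel-2 L2A digital numbers to reflectance is an assumption, and this index was not used in the model above.
# Sketch: MSAVI2 from the red and nir arrays read earlier (assumes reflectance = DN / 10000)
red_refl = red.astype("float32") / 10000.0
nir_refl = nir.astype("float32") / 10000.0
msavi = (2 * nir_refl + 1 - np.sqrt((2 * nir_refl + 1) ** 2 - 8 * (nir_refl - red_refl))) / 2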