"""
Adjusting the true positive locations where their coordinate uncertainty is larger than the pixel size of MODIS (250m).
This script computes the size of the area of uncertainty based on the number of decimal places in the latitude coordinate.
If this is larger than the MODIS pixel size, we search within the uncertainty area for the pixel that shows the largest drop in NDVI in the year of collapse
in comparison to the three years before that. This is then selected as the new true positive location.
The script requires the preprocessed zarr files as obtained from 01_data_preprocessing.py.
It can be run in test mode for only 5 locations, or in full mode for all locations.
Usage: python3 ./code/02_empirical_analysis/02_true_positive_alignment.py hammond_path hdf_input_folder zarr_input_folder test_mode
hammond_path: Path to the Hammond CSV file (default: "./data/raw/global_tree_mortality_database/GTM_full_database_resolve_biomes_2017.csv")
hdf_input_folder: Directory where the raw HDF files are stored (default: "./data/raw/MOD13Q1")
zarr_input_folder: Directory where the preprocessed zarr files are stored (default: "./data/intermediary/MOD13Q1_preprocessed_entry")
test_mode: If set to "test" (vs "full"), only process 5 locations for testing purposes (default: "test")
"""
from shapely import Point
from pyproj import Transformer
import geopandas as gpd
from datetime import datetime, timedelta
from tqdm.autonotebook import tqdm
import pandas as pd
import numpy as np
import os
import xarray as xr
import rioxarray
import pyproj
from affine import Affine
import math
import sys
#Replace print with a time-stamped print (every log line is prefixed with "now")
_print = print
def print(*args, **kwargs):
    """Shadow the builtin print, prefixing each message with the current timestamp.

    Uses the `datetime` class already imported at module level instead of
    re-importing the `datetime` module on every call (as the original did).
    Output is byte-identical: `datetime.now()` prints the same string either way.
    """
    _print(datetime.now(), *args, **kwargs)
#For all entries, compute x, y, minx, miny, maxx, maxy
def reproject_bounds(lon, lat, target_crs):
    """Reproject a WGS84 lon/lat point into `target_crs` and return the point
    together with the bounding box of a 10 km buffer around it.

    Parameters: lon, lat in EPSG:4326 decimal degrees; `target_crs` is any
    CRS accepted by pyproj (here the MODIS sinusoidal CRS of the HDF files).
    Returns (x, y, minx, miny, maxx, maxy) in `target_crs` units (metres).

    NOTE(review): buffer radius is 10_000 m, i.e. a 10 km buffer — an earlier
    comment claimed 25 km, which did not match the code.
    """
    #Get the crs and transform; always_xy forces (lon, lat) axis order
    transformer = Transformer.from_crs('EPSG:4326',target_crs, always_xy=True)
    # Transform the coordinates
    x, y = transformer.transform(lon, lat)
    # Create a 10 km buffer around the reprojected point
    buffer_geom = Point(x, y).buffer(10_000)
    # Get bounding box (minx, miny, maxx, maxy)
    minx, miny, maxx, maxy = buffer_geom.bounds
    return x, y, minx, miny, maxx, maxy
def coordinate_area(lat, decimal_places):
    """Return the ground area (in m^2) that one unit of coordinate precision covers.

    A coordinate reported with `decimal_places` decimals is only accurate to
    10**(-decimal_places) degrees; this converts that angular uncertainty
    into a metric rectangle at latitude `lat` (longitude degrees shrink with
    the cosine of the latitude).
    """
    # Metres per degree of latitude (~111 km, treated as constant)
    meters_per_deg_lat = 111_000
    # Metres per degree of longitude at this latitude
    meters_per_deg_lon = 111_000 * math.cos(math.radians(lat))
    # Angular precision implied by the reported number of decimals
    step = 10 ** (-decimal_places)
    # Rectangle spanned by one precision step in each direction
    return (step * meters_per_deg_lat) * (step * meters_per_deg_lon)
#Compute resistance on zarr
def resistance(df, collapse_year):
    """Add a 'resistance' variable to `df`: mean NDVI in the collapse year
    minus mean NDVI over the three preceding years (negative = NDVI drop).

    `df` is an xarray Dataset with an 'NDVI' variable on a 'time' axis;
    `collapse_year` is an integer year. Returns the new DataArray.
    """
    post_collapse = df.NDVI.sel(
        time=slice(f'{collapse_year}-01-01', f'{collapse_year + 1}-01-01')
    ).mean(dim='time')
    pre_collapse = df.NDVI.sel(
        time=slice(f'{collapse_year - 3}-01-01', f'{collapse_year}-01-01')
    ).mean(dim='time')
    df['resistance'] = post_collapse - pre_collapse
    return df['resistance']
#Function to run the true positive selection for one id
def true_pos_selection(id, df_hammond, zarr_input_folder):
    """Relocate one true-positive entry to the pixel with the strongest NDVI drop.

    Opens the preprocessed zarr cube for this entry, clips it to the square
    uncertainty area (side length `pixel_size`) centred on the entry's (x, y),
    computes the per-pixel resistance around the collapse year, and returns a
    one-row copy of the Hammond entry with (x, y) moved to the pixel whose
    resistance is minimal (i.e. the largest NDVI decline).

    Parameters: `id` (entry_id in df_hammond; note it shadows the builtin),
    `df_hammond` (DataFrame with entry_id, x, y, pixel_size, year_disturbance),
    `zarr_input_folder` (directory containing entry_{id}.zarr stores).
    Returns a one-row DataFrame labelled 'true_pos_adj'.
    """
    print('load zarr...')
    df = xr.open_zarr(os.path.join(zarr_input_folder, f"entry_{id}.zarr"), consolidated = False)
    #Get direction of y-axis: negative difference means y is ascending
    y_dir = df.y.values[0] - df.y.values[1]
    #Get year of disturbance
    print('get year of disturbance...')
    collapse_year = df_hammond.loc[df_hammond.entry_id == id, 'year_disturbance'].values[0]
    #Clip to just the region of interest: a pixel_size-wide square around (x, y)
    print('clip to region of interest...')
    minx_local = df_hammond.loc[df_hammond.entry_id == id, 'x'].values[0] - df_hammond.loc[df_hammond.entry_id == id, 'pixel_size'].values[0]/2
    maxx_local = df_hammond.loc[df_hammond.entry_id == id, 'x'].values[0] + df_hammond.loc[df_hammond.entry_id == id, 'pixel_size'].values[0]/2
    miny_local = df_hammond.loc[df_hammond.entry_id == id, 'y'].values[0] - df_hammond.loc[df_hammond.entry_id == id, 'pixel_size'].values[0]/2
    maxy_local = df_hammond.loc[df_hammond.entry_id == id, 'y'].values[0] + df_hammond.loc[df_hammond.entry_id == id, 'pixel_size'].values[0]/2
    #df = df.loc[(df.x >= minx_local) & (df.x <= maxx_local) & (df.y >= miny_local) & (df.y <= maxy_local)].reset_index(drop = True)
    # xarray label slices must follow the coordinate's sort order, so the
    # y-slice bounds are swapped when y is stored descending
    if y_dir < 0:
        df = df.sel(x = slice(minx_local, maxx_local), y = slice(miny_local, maxy_local)).reset_coords(drop = True)
    else:
        df = df.sel(x = slice(minx_local, maxx_local), y = slice(maxy_local, miny_local)).reset_coords(drop = True)
    #Compute resistance on grouped dataset (NDVI drop in collapse year vs 3 prior years)
    print('compute resistance...')
    df_indiv = resistance(df, collapse_year)
    # Flatten the array and find the index of the minimum (strongest NDVI drop)
    print('get minimum coordinates...')
    flat_idx = df_indiv.argmin().compute().item()
    # Stack the array so we can map flat index back to coordinates
    # NOTE(review): assumes df_indiv's dims are ordered ("y", "x") so that
    # argmin's flattened index matches the stack order — confirm upstream
    stacked = df_indiv.stack(z=("y", "x"))
    coord_at_min = stacked.isel(z=flat_idx)
    # Extract values
    min_value = coord_at_min.compute().item()
    x_min = coord_at_min["x"].item()
    y_min = coord_at_min["y"].item()
    #Pick the new true positive point, add this to Hammond with the same information
    new_point = df_hammond.loc[df_hammond.entry_id == id].copy()
    new_point[['x', 'y']] = [x_min, y_min]
    new_point['true_pos_neg'] = 'true_pos_adj'
    return new_point
def main(hammond_path, zarr_input_folder, test_mode):
    """Run the true-positive alignment end to end.

    Loads the Hammond mortality database, estimates each entry's coordinate
    uncertainty from the number of decimal places of its latitude, reprojects
    all entries to the CRS of the raw MODIS HDF files, and relocates every
    entry whose uncertainty exceeds the 250 m MODIS pixel to the pixel with
    the largest NDVI drop. The augmented table is saved under
    data/intermediary with a '_with_true_pos_adjusted' suffix.

    Parameters
    ----------
    hammond_path : str
        Path to the Hammond CSV database.
    zarr_input_folder : str
        Directory holding the per-entry preprocessed zarr cubes.
    test_mode : str
        "test" processes only 5 random entries; anything else processes all.

    NOTE(review): this function reads the module-level global
    `hdf_input_folder` (set in the __main__ block) to locate a raw HDF file
    whose CRS is used as reprojection target — confirm it is defined before
    calling main() from another module.
    """
    print("loading and preprocessing everything...")
    # Load forest dieback dataset (Hammond, 2022), dropping exact duplicate rows
    df_hammond = pd.read_csv(hammond_path).drop_duplicates()
    # Drop collapse events pre 1984
    df_hammond = df_hammond.loc[df_hammond.time > 1984].reset_index(drop=True)
    # Stable per-row identifier used to match zarr stores (entry_{id}.zarr)
    df_hammond['entry_id'] = df_hammond.index
    # For compatibility with downstream scripts, also add a paired_id column
    df_hammond['paired_id'] = df_hammond.entry_id
    # Rename for consistency with the rest of the pipeline
    df_hammond = df_hammond.rename(columns={'time': 'year_disturbance', 'long': 'lon'})
    df_hammond['true_pos_neg'] = 'true_pos'
    # Remove all those with collapse pre-2001 (MODIS record starts in 2000,
    # and a three-year pre-collapse baseline is needed)
    df_hammond = df_hammond.loc[df_hammond.year_disturbance > 2001].reset_index(drop=True)
    # If we run this in test mode, only keep 5 locations
    if test_mode == "test":
        df_hammond = df_hammond.sample(5, random_state=42).reset_index(drop=True)
        print("Running in test mode, only processing 5 locations.")
    # Location accuracy derived from the number of decimal places of the latitude
    df_hammond['area'] = df_hammond.lat.apply(lambda x: coordinate_area(x, len(str(x).split('.')[1])))
    df_hammond['decimal_digits_lat'] = df_hammond.lat.apply(lambda x: len(str(x).split('.')[1]))
    # Edge length (m) of the square uncertainty area
    df_hammond['pixel_size'] = df_hammond.area.apply(math.sqrt)
    # Reproject to the CRS of the raw MODIS HDF files; any one file provides it
    print("reprojecting coordinates...")
    first_file_path = None
    with os.scandir(hdf_input_folder) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.endswith('.hdf'):
                first_file_path = entry.path
                break
    # Fail fast with a clear message instead of an UnboundLocalError below
    if first_file_path is None:
        raise FileNotFoundError(f"No .hdf file found in {hdf_input_folder}")
    target_crs = rioxarray.open_rasterio(first_file_path).rio.crs
    # Apply the reprojection to all rows in df_hammond
    df_hammond['x'], df_hammond['y'], df_hammond['minx'], df_hammond['miny'], df_hammond['maxx'], df_hammond['maxy'] = zip(
        *df_hammond.apply(lambda row: reproject_bounds(row.lon, row.lat, target_crs), axis=1))
    # Adjust every entry whose positional uncertainty exceeds one MODIS pixel (250 m)
    ids_to_adjust = df_hammond.loc[df_hammond.pixel_size > 250, 'entry_id']
    new_point_list = []
    for entry_id in tqdm(ids_to_adjust, total=len(ids_to_adjust), desc="Finding true positives", unit="id"):
        print("evaluating id", entry_id)
        # BUG FIX: the original call omitted zarr_input_folder, which raised a
        # TypeError (true_pos_selection takes three positional arguments)
        new_point_list.append(true_pos_selection(entry_id, df_hammond, zarr_input_folder))
    if len(new_point_list) == 0:
        print("No points needed adjustment, exiting.")
    else:
        print(f"Found {len(new_point_list)} points that needed adjustment.")
        # Concatenate all adjusted points onto df_hammond
        df_hammond = pd.concat([df_hammond, pd.concat(new_point_list)], axis=0).reset_index(drop=True)
    # The (possibly unchanged) table is always written out
    print("saving to csv...")
    # Mirror the input path under data/intermediary and create the folder if needed
    new_path_name = hammond_path.replace('.csv', '_with_true_pos_adjusted.csv').replace('raw', 'intermediary')
    os.makedirs(os.path.dirname(new_path_name), exist_ok=True)
    df_hammond.to_csv(new_path_name, index=False)
    print("done :)")
if __name__ == "__main__":
    # CLI arguments with defaults; see the module docstring for details.
    hammond_path = sys.argv[1] if len(sys.argv) > 1 else "./data/raw/global_tree_mortality_database/GTM_full_database_resolve_biomes_2017.csv"
    hdf_input_folder = sys.argv[2] if len(sys.argv) > 2 else "./data/raw/MOD13Q1"
    # BUG FIX: the original guarded sys.argv[3] with len(sys.argv) > 2, so
    # invoking the script with exactly two arguments raised an IndexError.
    zarr_input_folder = sys.argv[3] if len(sys.argv) > 3 else "./data/intermediary/MOD13Q1_preprocessed_entry"
    test_mode = sys.argv[4] if len(sys.argv) > 4 else "test"  # "test" or "full"
    # Run the main function (hdf_input_folder is read by main as a module global)
    main(hammond_path, zarr_input_folder, test_mode)