# 25-09-ews-assessment / code / 02_empirical_analysis / 02_true_positive_alignment.py
"""
Adjusting the true positive locations where their coordinate uncertainty is larger than the pixel size of MODIS (250m).

This script computes the size of the area of uncertainty based on the number of decimal places in the latitude coordinate. 
If this is larger than the MODIS pixel size, we search within the uncertainty area for the pixel that shows the largest drop in NDVI in the year of collapse
in comparison to the three years before that. This is then selected as the new true positive location.

The script requires the preprocessed zarr files as obtained from 01_data_preprocessing.py.
It can be run in test mode for only 5 locations, or in full mode for all locations.

Usage: python3 ./code/02_empirical_analysis/02_true_positive_alignment.py hammond_path hdf_input_folder zarr_input_folder test_mode

hammond_path: Path to the Hammond CSV file (default: "./data/raw/global_tree_mortality_database/25_03_28_GTM_full_database_resolve_biomes_2017.csv")
hdf_input_folder: Directory where the raw HDF files are stored (default: "./data/raw/MOD13Q1")
zarr_input_folder: Directory where the preprocessed zarr files are stored (default: "./data/intermediary/MOD13Q1_preprocessed_entry")
test_mode: If set to "test" (vs "full"), only process 5 locations for testing purposes (default: "test")

"""

from shapely import Point
from pyproj import Transformer
import geopandas as gpd
from datetime import datetime, timedelta
from tqdm.autonotebook import tqdm
import pandas as pd
import numpy as np
import os
import xarray as xr
import rioxarray
import pyproj
from affine import Affine
import math
import sys

#Replace print with a time stamp print
_print = print
def print(*args, **kwargs):
    """Print *args* prefixed with the current wall-clock timestamp.

    Deliberately shadows the builtin ``print``; the original is kept in
    ``_print``. Accepts the same arguments as the builtin.
    """
    # Use the class imported at module level (`from datetime import datetime`)
    # instead of re-importing the datetime module on every call.
    _print(datetime.now(), *args, **kwargs)

#For all entries, compute x, y, minx, miny, maxx, maxy
def reproject_bounds(lon, lat, target_crs):
    #Get the crs and transform
    transformer = Transformer.from_crs('EPSG:4326',target_crs, always_xy=True)
    # Transform the coordinates
    x, y = transformer.transform(lon, lat)
    # Create a 25 km buffer
    buffer_geom = Point(x, y).buffer(10_000)
    # Get bounding box (minx, miny, maxx, maxy)
    minx, miny, maxx, maxy = buffer_geom.bounds
    return x, y, minx, miny, maxx, maxy


def coordinate_area(lat, decimal_places):
    """Return the ground area (in m²) covered by the coordinate uncertainty
    implied by *decimal_places* decimal digits at latitude *lat*."""
    # One degree of latitude is ~111 km everywhere; one degree of longitude
    # shrinks with the cosine of the latitude.
    meters_per_deg_lat = 111_000
    meters_per_deg_lon = meters_per_deg_lat * math.cos(math.radians(lat))
    # Smallest representable coordinate step (in degrees) at this precision.
    step_deg = 10 ** (-decimal_places)
    # Rectangle spanned by one coordinate step in each direction.
    return (step_deg * meters_per_deg_lat) * (step_deg * meters_per_deg_lon)

#Compute resistance on zarr
def resistance(df, collapse_year):
    df['resistance'] = df.NDVI.sel(time = slice(f'{collapse_year}-01-01', f'{collapse_year +1}-01-01')).mean(dim='time') - df.NDVI.sel(time = slice(f'{collapse_year-3}-01-01', f'{collapse_year}-01-01')).mean(dim='time')
    return df['resistance']


#Function to run the true positive selection for one id
def true_pos_selection(id, df_hammond, zarr_input_folder):
    print('load zarr...')
    df = xr.open_zarr(os.path.join(zarr_input_folder, f"entry_{id}.zarr"), consolidated = False)
    #Get direction of y-axis
    y_dir = df.y.values[0] - df.y.values[1]

    #Get year of disturbance
    print('get year of disturbance...')
    collapse_year = df_hammond.loc[df_hammond.entry_id == id, 'year_disturbance'].values[0]

    #Clip to just the region of interest
    print('clip to region of interest...')
    minx_local = df_hammond.loc[df_hammond.entry_id == id, 'x'].values[0] - df_hammond.loc[df_hammond.entry_id == id, 'pixel_size'].values[0]/2
    maxx_local = df_hammond.loc[df_hammond.entry_id == id, 'x'].values[0] + df_hammond.loc[df_hammond.entry_id == id, 'pixel_size'].values[0]/2
    miny_local = df_hammond.loc[df_hammond.entry_id == id, 'y'].values[0] - df_hammond.loc[df_hammond.entry_id == id, 'pixel_size'].values[0]/2
    maxy_local = df_hammond.loc[df_hammond.entry_id == id, 'y'].values[0] + df_hammond.loc[df_hammond.entry_id == id, 'pixel_size'].values[0]/2
    #df = df.loc[(df.x >= minx_local) & (df.x <= maxx_local) & (df.y >= miny_local) & (df.y <= maxy_local)].reset_index(drop = True)
    if y_dir < 0:
        df = df.sel(x = slice(minx_local, maxx_local), y = slice(miny_local, maxy_local)).reset_coords(drop = True)
    else:
        df = df.sel(x = slice(minx_local, maxx_local), y = slice(maxy_local, miny_local)).reset_coords(drop = True)

    #Compute resistance on grouped dataset
    print('compute resistance...')
    df_indiv = resistance(df, collapse_year)

    # Flatten the array and find the index of the minimum
    print('get minimum coordinates...')
    flat_idx = df_indiv.argmin().compute().item()

    # Stack the array so we can map flat index back to coordinates
    stacked = df_indiv.stack(z=("y", "x"))
    coord_at_min = stacked.isel(z=flat_idx)

    # Extract values
    min_value = coord_at_min.compute().item()
    x_min = coord_at_min["x"].item()
    y_min = coord_at_min["y"].item()

    #Pick the new true positive point, add this to Hammond with the same information 
    new_point = df_hammond.loc[df_hammond.entry_id == id].copy()
    new_point[['x', 'y']] = [x_min, y_min]
    new_point['true_pos_neg'] = 'true_pos_adj'

    return new_point


def main(hammond_path, zarr_input_folder, test_mode, hdf_folder=None):
    """Align true-positive locations whose coordinate uncertainty exceeds
    the MODIS pixel size (250 m).

    Loads the Hammond tree-mortality CSV, estimates each entry's location
    uncertainty from the number of decimal places of its latitude,
    reprojects coordinates into the MODIS CRS, relocates entries with
    pixel_size > 250 m via `true_pos_selection`, and writes the augmented
    table under data/intermediary.

    Parameters
    ----------
    hammond_path : str
        Path to the Hammond GTM CSV.
    zarr_input_folder : str
        Folder containing preprocessed ``entry_<id>.zarr`` stores.
    test_mode : str
        "test" keeps only 5 random locations; anything else processes all.
    hdf_folder : str, optional
        Folder with raw MOD13Q1 HDF files (used only to read the target
        CRS). When None, falls back to the module-level ``hdf_input_folder``
        set by the CLI block, preserving the original behaviour of reading
        that global.
    """
    if hdf_folder is None:
        # The original code read this name from a global set in __main__.
        hdf_folder = globals().get('hdf_input_folder', "./data/raw/MOD13Q1")

    # Load forest dieback dataset (Hammond, 2022)
    print("loading and preprocessing everything...")
    df_hammond = pd.read_csv(hammond_path).drop_duplicates()
    #Drop collapse events pre 1984
    df_hammond = df_hammond.loc[df_hammond.time > 1984].reset_index(drop=True)
    #Add an entry_id; paired_id kept identical for downstream compatibility
    df_hammond['entry_id'] = df_hammond.index
    df_hammond['paired_id'] = df_hammond.entry_id
    #Rename for consistency with the rest of the pipeline
    df_hammond = df_hammond.rename(columns={'time':'year_disturbance', 'long':'lon'})
    df_hammond['true_pos_neg'] = 'true_pos'
    #Remove all those with collapse pre-2001
    df_hammond = df_hammond.loc[df_hammond.year_disturbance > 2001].reset_index(drop=True)

    #If we run this in test mode, only keep 5 locations
    if test_mode == "test":
        df_hammond = df_hammond.sample(5, random_state = 42).reset_index(drop=True)
        print("Running in test mode, only processing 5 locations.")

    #Location accuracy based on the number of decimal places of the latitude
    # NOTE(review): assumes str(lat) always contains a '.' (floats from
    # read_csv do); integer-typed latitudes would raise IndexError here.
    df_hammond['area'] = df_hammond.lat.apply(lambda x: coordinate_area(x, len(str(x).split('.')[1])))
    df_hammond['decimal_digits_lat'] = df_hammond.lat.apply(lambda x: len(str(x).split('.')[1]))
    #Edge length of the square uncertainty area
    df_hammond['pixel_size'] = df_hammond.area.apply(math.sqrt)

    #Reproject to the metric CRS of the MODIS product; the CRS is taken from
    #the first HDF file found in the input folder.
    print("reprojecting coordinates...")
    first_file_path = None
    with os.scandir(hdf_folder) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.endswith('.hdf'):
                first_file_path = entry.path
                break
    if first_file_path is None:
        # Previously an unbound-variable crash; fail with a clear message.
        raise FileNotFoundError(f"No .hdf file found in {hdf_folder}")
    target_crs = rioxarray.open_rasterio(first_file_path).rio.crs
    #Apply to all rows in df_hammond
    df_hammond['x'], df_hammond['y'], df_hammond['minx'], df_hammond['miny'], df_hammond['maxx'], df_hammond['maxy'] = zip(
        *df_hammond.apply(lambda r: reproject_bounds(r.lon, r.lat, target_crs), axis = 1))

    #Loop through all entries that have pixel size of > 250m
    too_coarse = df_hammond.loc[df_hammond.pixel_size > 250].entry_id
    new_point_list = []
    for id in tqdm(too_coarse, total = len(too_coarse), desc = "Finding true positives", unit = "id"):
        print("evaluating id", id)
        # BUG FIX: zarr_input_folder was previously not passed here, which
        # raised a TypeError on the first iteration.
        new_point_list.append(true_pos_selection(id, df_hammond, zarr_input_folder))

    #If this is not an empty list, concatenate
    if len(new_point_list) == 0:
        print("No points needed adjustment, exiting.")
    else:
        print(f"Found {len(new_point_list)} points that needed adjustment.")
        #Append the adjusted duplicates to the original table
        df_hammond = pd.concat([df_hammond, pd.concat(new_point_list)], axis = 0).reset_index(drop = True)

    #Save next to the raw file, but under data/intermediary
    print("saving to csv...")
    new_path_name = hammond_path.replace('.csv', '_with_true_pos_adjusted.csv').replace('raw', 'intermediary')
    os.makedirs(os.path.dirname(new_path_name), exist_ok = True)
    df_hammond.to_csv(new_path_name, index = False)

    print("done :)")

if __name__ == "__main__":
    #CLI arguments (all optional, positional); see module docstring for usage
    hammond_path = sys.argv[1] if len(sys.argv) > 1 else "./data/raw/global_tree_mortality_database/GTM_full_database_resolve_biomes_2017.csv"
    hdf_input_folder = sys.argv[2] if len(sys.argv) > 2 else "./data/raw/MOD13Q1"
    # BUG FIX: the guard previously tested len(sys.argv) > 2 while indexing
    # sys.argv[3], so invoking the script with exactly two arguments raised
    # an IndexError.
    zarr_input_folder = sys.argv[3] if len(sys.argv) > 3 else "./data/intermediary/MOD13Q1_preprocessed_entry"
    test_mode = sys.argv[4] if len(sys.argv) > 4 else "test"  # "test" or "full"

    #Run the main function (main reads the hdf_input_folder set above)
    main(hammond_path, zarr_input_folder, test_mode)