""" Adjusting the true positive locations where their coordinate uncertainty is larger than the pixel size of MODIS (250m). This script computes the size of the area of uncertainty based on the number of decimal places in the latitude coordinate. If this is larger than the MODIS pixel size, we search within the uncertainty area for the pixel that shows the largest drop in NDVI in the year of collapse in comparison to the three years before that. This is then selected as the new true positive location. The script requires the preprocessed zarr files as obtained from 01_data_preprocessing.py. It can be run in test mode for only 2 locations, or in full mode for all locations. Usage: python3 ./code/02_empirical_analysis/02_true_positive_alignment.py hammond_path hdf_input_folder zarr_input_folder test_mode hammond_path: Path to the Hammond CSV file (default: "./data/raw/global_tree_mortality_database/25_03_28_GTM_full_database_resolve_biomes_2017.csv") hdf_input_folder: Directory where the raw HDF files are stored (default: "./data/raw/MOD13Q1") zarr_input_folder: Directory where the preprocessed zarr files are stored (default: "./data/intermediary/MOD13Q1_preprocessed_entry") test_mode: If set to "test" (vs "full"), only process 5 locations for testing purposes (default: "test") """ from shapely import Point from pyproj import Transformer import geopandas as gpd from datetime import datetime, timedelta from tqdm.autonotebook import tqdm import pandas as pd import numpy as np import os import xarray as xr import rioxarray import pyproj from affine import Affine import math import sys #Replace print with a time stamp print _print = print def print(*args, **kwargs): import datetime _print(datetime.datetime.now(), *args, **kwargs) #For all entries, compute x, y, minx, miny, maxx, maxy def reproject_bounds(lon, lat, target_crs): #Get the crs and transform transformer = Transformer.from_crs('EPSG:4326',target_crs, always_xy=True) # Transform the coordinates x, y = transformer.transform(lon, lat) # Create a 25 km buffer buffer_geom = Point(x, y).buffer(10_000) # Get bounding box (minx, miny, maxx, maxy) minx, miny, maxx, maxy = buffer_geom.bounds return x, y, minx, miny, maxx, maxy def coordinate_area(lat, decimal_places): # Approximate length of one degree of latitude in meters lat_length = 111_000 # ~111 km # Approximate length of one degree of longitude in meters (varies by latitude) lon_length = 111_000 * math.cos(math.radians(lat)) # Precision in degrees based on decimal places precision = 10 ** (-decimal_places) # Area covered by the given decimal precision area = (precision * lat_length) * (precision * lon_length) # in square meters return area #Compute resistance on zarr def resistance(df, collapse_year): df['resistance'] = df.NDVI.sel(time = slice(f'{collapse_year}-01-01', f'{collapse_year +1}-01-01')).mean(dim='time') - df.NDVI.sel(time = slice(f'{collapse_year-3}-01-01', f'{collapse_year}-01-01')).mean(dim='time') return df['resistance'] #Function to run the true positive selection for one id def true_pos_selection(id, df_hammond, zarr_input_folder): print('load zarr...') df = xr.open_zarr(os.path.join(zarr_input_folder, f"entry_{id}.zarr"), consolidated = False) #Get direction of y-axis y_dir = df.y.values[0] - df.y.values[1] #Get year of disturbance print('get year of disturbance...') collapse_year = df_hammond.loc[df_hammond.entry_id == id, 'year_disturbance'].values[0] #Clip to just the region of interest print('clip to region of interest...') minx_local = df_hammond.loc[df_hammond.entry_id == id, 'x'].values[0] - df_hammond.loc[df_hammond.entry_id == id, 'pixel_size'].values[0]/2 maxx_local = df_hammond.loc[df_hammond.entry_id == id, 'x'].values[0] + df_hammond.loc[df_hammond.entry_id == id, 'pixel_size'].values[0]/2 miny_local = df_hammond.loc[df_hammond.entry_id == id, 'y'].values[0] - df_hammond.loc[df_hammond.entry_id == id, 'pixel_size'].values[0]/2 maxy_local = df_hammond.loc[df_hammond.entry_id == id, 'y'].values[0] + df_hammond.loc[df_hammond.entry_id == id, 'pixel_size'].values[0]/2 #df = df.loc[(df.x >= minx_local) & (df.x <= maxx_local) & (df.y >= miny_local) & (df.y <= maxy_local)].reset_index(drop = True) if y_dir < 0: df = df.sel(x = slice(minx_local, maxx_local), y = slice(miny_local, maxy_local)).reset_coords(drop = True) else: df = df.sel(x = slice(minx_local, maxx_local), y = slice(maxy_local, miny_local)).reset_coords(drop = True) #Compute resistance on grouped dataset print('compute resistance...') df_indiv = resistance(df, collapse_year) # Flatten the array and find the index of the minimum print('get minimum coordinates...') flat_idx = df_indiv.argmin().compute().item() # Stack the array so we can map flat index back to coordinates stacked = df_indiv.stack(z=("y", "x")) coord_at_min = stacked.isel(z=flat_idx) # Extract values min_value = coord_at_min.compute().item() x_min = coord_at_min["x"].item() y_min = coord_at_min["y"].item() #Pick the new true positive point, add this to Hammond with the same information new_point = df_hammond.loc[df_hammond.entry_id == id].copy() new_point[['x', 'y']] = [x_min, y_min] new_point['true_pos_neg'] = 'true_pos_adj' return new_point def main(hammond_path, zarr_input_folder, test_mode): # Load Hammond print("loading and preprocessing everything...") #Load forest dieback dataset (Hammond, 2022) df_hammond = pd.read_csv(hammond_path).drop_duplicates() #Drop collapse events pre 1984 df_hammond = df_hammond.loc[df_hammond.time > 1984].reset_index(drop=True) #Add an entry_id to this df_hammond['entry_id'] = df_hammond.index #For compatibility, also add a paired_id column df_hammond['paired_id'] = df_hammond.entry_id #Rename for consistency df_hammond = df_hammond.rename(columns={'time':'year_disturbance', 'long':'lon'}) df_hammond['true_pos_neg'] = 'true_pos' #Remove all those with collapse pre-2001 df_hammond = df_hammond.loc[df_hammond.year_disturbance > 2001].reset_index(drop=True) #If we run this in test mode, only keep 5 locations if test_mode == "test": df_hammond = df_hammond.sample(5, random_state = 42).reset_index(drop=True) print("Running in test mode, only processing 5 locations.") #Add a column with the accuracy of the location, based on the number of decimal places in x and y coordinates df_hammond['area'] = df_hammond.lat.apply(lambda x: coordinate_area(x, len(str(x).split('.')[1]))) df_hammond['decimal_digits_lat'] = df_hammond.lat.apply(lambda x: len(str(x).split('.')[1])) #Get the pixel Kanten size for the area df_hammond['pixel_size'] = df_hammond.area.apply(lambda x: math.sqrt(x)) #Apply reprojection to metric coordinates print("reprojecting coordinates...") with os.scandir(hdf_input_folder) as entries: for entry in entries: if ((entry.is_file()) and (entry.name.endswith('.hdf'))): first_file_path = entry.path break #Apply to all rows in df_hammond r1 = rioxarray.open_rasterio(first_file_path) target_crs = r1.rio.crs df_hammond['x'], df_hammond['y'], df_hammond['minx'], df_hammond['miny'], df_hammond['maxx'], df_hammond['maxy'] = zip(*df_hammond.apply(lambda x: reproject_bounds(x.lon, x.lat, target_crs), axis = 1)) #Loop through all those entries that have pixel size of > 250m new_point_list = [] for id in tqdm(df_hammond.loc[df_hammond.pixel_size > 250].entry_id, total = len(df_hammond.loc[df_hammond.pixel_size > 250].entry_id), desc = "Finding true positives", unit = "id"): print("evaluating id", id) new_point_list.append(true_pos_selection(id, df_hammond)) #If this is not an empty list, concatenate if len(new_point_list) == 0: print("No points needed adjustment, exiting.") else: print(f"Found {len(new_point_list)} points that needed adjustment.") #Concatenate all new points to df_hammond df_hammond = pd.concat([df_hammond, pd.concat(new_point_list)], axis = 0).reset_index(drop = True) #Save this to a new csv print("saving to csv...") #Create intermediary folder if it does not exist os.makedirs(os.path.dirname(hammond_path.replace('.csv', '_with_true_pos_adjusted.csv').replace('raw', 'intermediary')), exist_ok = True) #Save to new path new_path_name = os.path.join(hammond_path.replace('.csv', '_with_true_pos_adjusted.csv').replace('raw', 'intermediary')) df_hammond.to_csv(new_path_name, index = False) print("done :)") if __name__ == "__main__": #Get arguments hammond_path = sys.argv[1] if len(sys.argv) > 1 else "./data/raw/global_tree_mortality_database/GTM_full_database_resolve_biomes_2017.csv" hdf_input_folder = sys.argv[2] if len(sys.argv) > 2 else "./data/raw/MOD13Q1" zarr_input_folder = sys.argv[3] if len(sys.argv) > 2 else "./data/intermediary/MOD13Q1_preprocessed_entry" test_mode = sys.argv[4] if len(sys.argv) > 4 else "test" # "test" or "full" #Run the main function main(hammond_path, zarr_input_folder, test_mode)