""" Download of MODIS files for Hammond ground truthing locations using earthaccess This script uses the earthaccess library to search and download MODIS MOD13Q1 data (in hdf4 format) for locations specified in the Hammond global tree mortality database. It checks for existing files to avoid duplicates and saves new downloads to a specified directory. This script can be run in test mode for only 2 locations, or in full mode for all locations. NOTE: You must have a NASA Earthdata account to run this script. You can create one for free at https://urs.earthdata.nasa.gov/ When running this script for the first time, you will be prompted to log in to your Earthdata account. Running this script in test mode takes around 20 minutes to complete. Usage: python3 ./code/02_empirical_analysis/00_data_download_earthaccess.py hammond_path output_dir test_mode hammond_path: Path to the Hammond CSV file (default: "./data/raw/global_tree_mortality_database/GTM_full_database_resolve_biomes_2017.csv") output_dir: Directory to save downloaded MODIS files (default: "./data/raw/MOD13Q1") test_mode: If set to "test", only process 5 random locations for testing purposes (default: "full") """ import earthpy import earthaccess from shapely import Point from pyproj import Transformer import geopandas as gpd from datetime import datetime, timedelta from tqdm import tqdm import os import pandas as pd from pathos.pools import ProcessPool from dask.diagnostics import ProgressBar import sys #Replace print with a time stamp print _print = print def print(*args, **kwargs): import datetime _print(datetime.datetime.now(), *args, **kwargs) #Main function def main(hammond_path, output_dir, test_mode): #Using earthpy, find all relevant MODIS MOD13Q1 tiles at the location of interest # Authenticate with Earthdata earthaccess.login(strategy = 'interactive', persist = True) print("loading Hammond data...") #Load Hammond data and preprocess for MODIS relevant data df_hammond = pd.read_csv(hammond_path).drop_duplicates() #Drop collapse events pre 1984 df_hammond = df_hammond.loc[df_hammond.time > 1984].reset_index(drop=True) #Add an entry_id to this df_hammond['entry_id'] = df_hammond.index #For compatibility, also add a paired_id column df_hammond['paired_id'] = df_hammond.entry_id #Rename for consistency df_hammond = df_hammond.rename(columns={'time':'year_disturbance', 'long':'lon'}) df_hammond['true_pos_neg'] = 'true_pos' #Remove all those with collapse pre-2001 df_hammond = df_hammond.loc[df_hammond.year_disturbance > 2001].reset_index(drop=True) #If we run this in test mode, only keep 5 locations if test_mode == "test": df_hammond = df_hammond.sample(5, random_state = 42) print("Running in test mode, only processing 5 locations:", df_hammond.entry_id.tolist()) print('checking for existing files...') #Get list of files already in output directory, if it exists if os.path.exists(output_dir): files = os.listdir(output_dir) else: os.makedirs(output_dir, exist_ok=True) files = [] print('searching for new files...') # Loop through all locations with a progress bar if the text file does not already exist if not os.path.exists(f"{output_dir}/MOD13Q1_results.txt"): res_list = [] for lon, lat in tqdm(zip(df_hammond.lon, df_hammond.lat), total=len(df_hammond), desc="Searching MODIS Data"): results = earthaccess.search_data( short_name='MOD13Q1', bounding_box=(lon - 0.25, lat - 0.25, lon + 0.25, lat + 0.25), temporal=('2001-01-01', '2025-01-01'), cloud_hosted=True ) # Append the data links [res_list.append(r.data_links()) for r in 
import earthaccess
from tqdm import tqdm
import os
import pandas as pd
import sys

# Replace print with a time-stamped print
_print = print
def print(*args, **kwargs):
    import datetime
    _print(datetime.datetime.now(), *args, **kwargs)


# Main function
def main(hammond_path, output_dir, test_mode):
    # Using earthaccess, find all relevant MODIS MOD13Q1 tiles at the locations of interest
    # Authenticate with Earthdata
    earthaccess.login(strategy='interactive', persist=True)

    print("loading Hammond data...")
    # Load Hammond data and preprocess for MODIS-relevant data
    df_hammond = pd.read_csv(hammond_path).drop_duplicates()
    # Drop collapse events pre-1984
    df_hammond = df_hammond.loc[df_hammond.time > 1984].reset_index(drop=True)
    # Add an entry_id to this
    df_hammond['entry_id'] = df_hammond.index
    # For compatibility, also add a paired_id column
    df_hammond['paired_id'] = df_hammond.entry_id
    # Rename for consistency
    df_hammond = df_hammond.rename(columns={'time': 'year_disturbance', 'long': 'lon'})
    df_hammond['true_pos_neg'] = 'true_pos'
    # Remove all those with collapse pre-2001
    df_hammond = df_hammond.loc[df_hammond.year_disturbance > 2001].reset_index(drop=True)

    # If we run this in test mode, only keep 5 locations
    if test_mode == "test":
        df_hammond = df_hammond.sample(5, random_state=42)
        print("Running in test mode, only processing 5 locations:", df_hammond.entry_id.tolist())

    print('checking for existing files...')
    # Get list of files already in the output directory, if it exists
    if os.path.exists(output_dir):
        files = os.listdir(output_dir)
    else:
        os.makedirs(output_dir, exist_ok=True)
        files = []

    print('searching for new files...')
    # Loop through all locations with a progress bar if the results file does not already exist
    if not os.path.exists(f"{output_dir}/MOD13Q1_results.txt"):
        res_list = []
        for lon, lat in tqdm(zip(df_hammond.lon, df_hammond.lat),
                             total=len(df_hammond), desc="Searching MODIS Data"):
            results = earthaccess.search_data(
                short_name='MOD13Q1',
                bounding_box=(lon - 0.25, lat - 0.25, lon + 0.25, lat + 0.25),
                temporal=('2001-01-01', '2025-01-01'),
                cloud_hosted=True
            )
            # Append the data links (one list of URLs per granule)
            for r in results:
                res_list.append(r.data_links())
        # Flatten the list
        res_list = [item for sublist in res_list for item in sublist]
        # Keep only .hdf files
        res_list = [r.rstrip() for r in res_list if r.endswith('.hdf')]
        res_length = len(res_list)
        # Remove those that are already in the folder
        res_list = [r for r in res_list if r.rstrip().split('/')[-1] not in files]
        print(f'Found {len(res_list)} new files to download out of {res_length} total files found')
        # Save the results to a file
        with open(f"{output_dir}/MOD13Q1_results.txt", "w") as f:
            for item in res_list:
                f.write(f"{item}\n")
        # Read it again
        with open(f"{output_dir}/MOD13Q1_results.txt", "r") as f:
            res_list = f.readlines()
        res_list = [r.rstrip() for r in res_list]
        res_list.reverse()
        orig_len = len(res_list)
        # Remove those that are already in the folder
        res_list = [r for r in res_list if r.rstrip().split('/')[-1] not in files]
        print(f'After re-checking, {len(res_list)} new files to download out of {orig_len} total files found')
    else:
        with open(f"{output_dir}/MOD13Q1_results.txt", "r") as f:
            res_list = f.readlines()
        res_list = [r.rstrip() for r in res_list]
        res_list.reverse()
        orig_len = len(res_list)
        # Remove those that are already in the folder
        res_list = [r for r in res_list if r.rstrip().split('/')[-1] not in files]
        print(f'Found existing results file, {len(res_list)} new files to download out of {orig_len} total files found')

    # Report what remains to download after the duplicate check
    print(f'Found {len(res_list)} new files to download')
    print(res_list[:5])

    if len(res_list) > 0:
        print('downloading new files...')
        # Download all files into the output directory
        earthaccess.download(res_list, output_dir, threads=16)

    print("complete")


if __name__ == "__main__":
    # Get Hammond path and output directory from user or set defaults
    # Local run setup
    # hammond_path = sys.argv[1] if len(sys.argv) > 1 else "/home/nielja/USBdisc/data/on_the_ground/global_tree_mortality_database/GTM_full_database_resolve_biomes_2017.csv"
    # output_dir = sys.argv[2] if len(sys.argv) > 2 else "/home/nielja/USBdisc/data/25_03_27_MOD13Q1_groundtruthing"

    # General setup
    hammond_path = sys.argv[1] if len(sys.argv) > 1 else "./data/raw/global_tree_mortality_database/GTM_full_database_resolve_biomes_2017.csv"
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "./data/raw/MOD13Q1"
    test_mode = sys.argv[3] if len(sys.argv) > 3 else "test"  # "test" or "full"

    # Run the script
    main(hammond_path, output_dir, test_mode)
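    # Optional post-run sanity check (a minimal sketch, not part of the original
    # pipeline): count the .hdf granules actually present on disk after the run.
    # Uncomment to use.
    # n_hdf = len([f for f in os.listdir(output_dir) if f.endswith('.hdf')])
    # print(f"{n_hdf} .hdf granules now in {output_dir}")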