"""
Download of MODIS files for Hammond ground truthing locations using earthaccess

This script uses the earthaccess library to search for and download MODIS MOD13Q1 data (in HDF4 format) for the locations
specified in the Hammond global tree mortality database.
It checks for existing files to avoid duplicate downloads and saves new files to the specified output directory.

This script can be run in test mode (5 randomly sampled locations only) or in full mode (all locations).

NOTE: You must have a NASA Earthdata account to run this script. You can create one for free at https://urs.earthdata.nasa.gov/ 
When running this script for the first time, you will be prompted to log in to your Earthdata account.

Running this script in test mode takes around 20 minutes to complete.

Usage: python3 ./code/02_empirical_analysis/00_data_download_earthaccess.py hammond_path output_dir test_mode

hammond_path: Path to the Hammond CSV file (default: "./data/raw/global_tree_mortality_database/GTM_full_database_resolve_biomes_2017.csv")
output_dir: Directory to save downloaded MODIS files (default: "./data/raw/MOD13Q1")
test_mode: If set to "test", only process 5 randomly sampled locations for testing purposes (default: "test")
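
Example invocation (test mode, using the default paths above):
    python3 ./code/02_empirical_analysis/00_data_download_earthaccess.py \
        ./data/raw/global_tree_mortality_database/GTM_full_database_resolve_biomes_2017.csv \
        ./data/raw/MOD13Q1 test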
"""

import earthaccess
from datetime import datetime
from tqdm import tqdm
import os
import pandas as pd
import sys

#Replace print with a time-stamped print
_print = print
def print(*args, **kwargs):
    _print(datetime.now(), *args, **kwargs)

#Main function
def main(hammond_path, output_dir, test_mode):
    #Using earthaccess, find all relevant MODIS MOD13Q1 granules for the locations of interest
    # Authenticate with Earthdata
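    # persist=True stores the credentials in a .netrc file, so later runs should not prompt again.
    # For non-interactive runs, earthaccess also supports strategy='environment', which reads the
    # EARTHDATA_USERNAME and EARTHDATA_PASSWORD environment variables.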
    earthaccess.login(strategy = 'interactive', persist = True)

    print("loading Hammond data...")
    #Load Hammond data and preprocess for MODIS relevant data
    df_hammond = pd.read_csv(hammond_path).drop_duplicates()
    #Keep only collapse events after 1984
    df_hammond = df_hammond.loc[df_hammond.time > 1984].reset_index(drop=True)
    #Add an entry_id to this
    df_hammond['entry_id'] = df_hammond.index
    #For compatibility, also add a paired_id column
    df_hammond['paired_id'] = df_hammond.entry_id
    #Rename for consistency
    df_hammond = df_hammond.rename(columns={'time':'year_disturbance', 'long':'lon'})
    df_hammond['true_pos_neg'] = 'true_pos'
    #Keep only collapse events after 2001
    df_hammond = df_hammond.loc[df_hammond.year_disturbance > 2001].reset_index(drop=True)

    #If we run this in test mode, only keep 5 locations
    if test_mode == "test":
        df_hammond = df_hammond.sample(5, random_state = 42)
        print("Running in test mode, only processing 5 locations:", df_hammond.entry_id.tolist())

    print('checking for existing files...')
    #Get list of files already in output directory, if it exists
    if os.path.exists(output_dir):
        files = os.listdir(output_dir)
    else:
        os.makedirs(output_dir, exist_ok=True)
        files = []

    print('searching for new files...')
    # If the results file does not already exist, search all locations (with a progress bar)
    if not os.path.exists(f"{output_dir}/MOD13Q1_results.txt"):
        res_list = []
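        # MOD13Q1 is the MODIS/Terra 16-day, 250 m vegetation index product; each site is
        # queried with a 0.5 x 0.5 degree bounding box centred on its coordinates.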
        for lon, lat in tqdm(zip(df_hammond.lon, df_hammond.lat), total=len(df_hammond), desc="Searching MODIS Data"):
            results = earthaccess.search_data(
                short_name='MOD13Q1',
                bounding_box=(lon - 0.25, lat - 0.25, lon + 0.25, lat + 0.25),
                temporal=('2001-01-01', '2025-01-01'),
                cloud_hosted=True
            )
            
            # Append the data links for each granule
            for r in results:
                res_list.append(r.data_links())

        #Flatten the list
        res_list = [item for sublist in res_list for item in sublist]
        #Keep only .hdf files
        res_list = [r.rstrip() for r in res_list if r.endswith('.hdf')]
        res_length = len(res_list)
        #Remove those that are already in folder
        res_list = [r for r in res_list if r.rstrip().split('/')[-1] not in files]
        print(f'Found {len(res_list)} new files to download out of {res_length} total files found')
        #Save the results to a file
        with open(f"{output_dir}/MOD13Q1_results.txt", "w") as f:
            for item in res_list:
                f.write(f"{item}\n")

        #Re-read the saved results and re-check against files already in the output directory
        with open(f"{output_dir}/MOD13Q1_results.txt", "r") as f:
            res_list = f.readlines()
            res_list = [r.rstrip() for r in res_list]
            res_list.reverse()
            orig_len = len(res_list)
            #Remove those that are already in folder
            res_list = [r for r in res_list if r.rstrip().split('/')[-1] not in files]
            print(f'After re-checking, {len(res_list)} new files to download out of {orig_len} total files found')
    else:
        with open(f"{output_dir}/MOD13Q1_results.txt", "r") as f:
            res_list = f.readlines()
            res_list = [r.rstrip() for r in res_list]
            res_list.reverse()
            orig_len = len(res_list)
            #Remove those that are already in folder
            res_list = [r for r in res_list if r.rstrip().split('/')[-1] not in files]
            print(f'Found existing results file, {len(res_list)} new files to download out of {orig_len} total files found')
    
    #Report the final list of new files to download
    print(f'Found {len(res_list)} new files to download')
    print(res_list[:5])

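    # earthaccess.download accepts either granule objects or plain URLs and fetches them in
    # parallel; threads=16 sets the number of concurrent download threads.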
    if len(res_list) > 0:
        print('downloading new files...')
        #Download all new files into the output directory
        files = earthaccess.download(res_list, output_dir, threads=16)
    print("complete")

if __name__ == "__main__":

    # Get Hammond path and output directory from user or set defaults

    #Local run setup
    #hammond_path = sys.argv[1] if len(sys.argv) > 1 else "/home/nielja/USBdisc/data/on_the_ground/global_tree_mortality_database/GTM_full_database_resolve_biomes_2017.csv"
    #output_dir = sys.argv[2] if len(sys.argv) > 2 else "/home/nielja/USBdisc/data/25_03_27_MOD13Q1_groundtruthing"

    #General setup
    hammond_path = sys.argv[1] if len(sys.argv) > 1 else "./data/raw/global_tree_mortality_database/GTM_full_database_resolve_biomes_2017.csv"
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "./data/raw/MOD13Q1"
    test_mode = sys.argv[3] if len(sys.argv) > 3 else "test"  # "test" or "full"

    #Run script
    main(hammond_path, output_dir, test_mode)