"""
Download of MODIS files for Hammond ground-truthing locations using earthaccess.

This script uses the earthaccess library to search and download MODIS MOD13Q1 data (in HDF4 format) for locations
specified in the Hammond global tree mortality database.
It checks for existing files to avoid duplicates and saves new downloads to a specified directory.
The script can be run in test mode for only 5 locations, or in full mode for all locations.
NOTE: You must have a NASA Earthdata account to run this script. You can create one for free at https://urs.earthdata.nasa.gov/
When running this script for the first time, you will be prompted to log in to your Earthdata account.
Running this script in test mode takes around 20 minutes to complete.
Usage: python3 ./code/02_empirical_analysis/00_data_download_earthaccess.py hammond_path output_dir test_mode
    hammond_path: Path to the Hammond CSV file (default: "./data/raw/global_tree_mortality_database/GTM_full_database_resolve_biomes_2017.csv")
    output_dir: Directory to save downloaded MODIS files (default: "./data/raw/MOD13Q1")
    test_mode: If set to "test", only process 5 random locations for testing purposes (default: "test")
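
Example call (using the defaults shown above):
    python3 ./code/02_empirical_analysis/00_data_download_earthaccess.py ./data/raw/global_tree_mortality_database/GTM_full_database_resolve_biomes_2017.csv ./data/raw/MOD13Q1 test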
"""
import datetime
import os
import sys

import earthaccess
import pandas as pd
from tqdm import tqdm
# Replace print with a time-stamped print
_print = print
def print(*args, **kwargs):
    _print(datetime.datetime.now(), *args, **kwargs)
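# After this override, every print() call in the script is prefixed with a
# timestamp, e.g. "2025-01-01 12:00:00.000000 loading Hammond data..."
# (the timestamp shown here is purely illustrative).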
# Main function
def main(hammond_path, output_dir, test_mode):
    # Authenticate with Earthdata (interactive prompt on first run; credentials are persisted for later runs)
    earthaccess.login(strategy='interactive', persist=True)
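    # Note: earthaccess also supports non-interactive logins via strategy='netrc'
    # (credentials in ~/.netrc) or strategy='environment' (EARTHDATA_USERNAME and
    # EARTHDATA_PASSWORD variables), which may be preferable on a cluster; the
    # interactive prompt is used here as described in the docstring.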
print("loading Hammond data...")
#Load Hammond data and preprocess for MODIS relevant data
df_hammond = pd.read_csv(hammond_path).drop_duplicates()
#Drop collapse events pre 1984
df_hammond = df_hammond.loc[df_hammond.time > 1984].reset_index(drop=True)
#Add an entry_id to this
df_hammond['entry_id'] = df_hammond.index
#For compatibility, also add a paired_id column
df_hammond['paired_id'] = df_hammond.entry_id
#Rename for consistency
df_hammond = df_hammond.rename(columns={'time':'year_disturbance', 'long':'lon'})
df_hammond['true_pos_neg'] = 'true_pos'
#Remove all those with collapse pre-2001
df_hammond = df_hammond.loc[df_hammond.year_disturbance > 2001].reset_index(drop=True)
#If we run this in test mode, only keep 5 locations
if test_mode == "test":
df_hammond = df_hammond.sample(5, random_state = 42)
print("Running in test mode, only processing 5 locations:", df_hammond.entry_id.tolist())
    print('checking for existing files...')
    # Get the list of files already in the output directory, creating it if needed
    if os.path.exists(output_dir):
        files = os.listdir(output_dir)
    else:
        os.makedirs(output_dir, exist_ok=True)
        files = []
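    # 'files' is used below to skip any granule whose filename is already present
    # locally, so the script can safely be re-run after an interrupted download.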
    print('searching for new files...')
    # Loop through all locations with a progress bar if the results file does not already exist
    if not os.path.exists(f"{output_dir}/MOD13Q1_results.txt"):
        res_list = []
        for lon, lat in tqdm(zip(df_hammond.lon, df_hammond.lat), total=len(df_hammond), desc="Searching MODIS Data"):
            # Search a 0.5 x 0.5 degree bounding box centred on each site so that
            # every MOD13Q1 granule intersecting the location is returned
            results = earthaccess.search_data(
                short_name='MOD13Q1',
                bounding_box=(lon - 0.25, lat - 0.25, lon + 0.25, lat + 0.25),
                temporal=('2001-01-01', '2025-01-01'),
                cloud_hosted=True
            )
            # Collect the data links (one list of URLs per granule)
            res_list += [r.data_links() for r in results]
        # Flatten the list of lists
        res_list = [item for sublist in res_list for item in sublist]
        # Keep only .hdf files
        res_list = [r.rstrip() for r in res_list if r.endswith('.hdf')]
        res_length = len(res_list)
        # Remove those that are already in the output folder
        res_list = [r for r in res_list if r.split('/')[-1] not in files]
        print(f'Found {len(res_list)} new files to download out of {res_length} total files found')
        # Save the results to a file
        with open(f"{output_dir}/MOD13Q1_results.txt", "w") as f:
            for item in res_list:
                f.write(f"{item}\n")
        # Read the file back in and re-check against the output folder
        with open(f"{output_dir}/MOD13Q1_results.txt", "r") as f:
            res_list = [r.rstrip() for r in f.readlines()]
        res_list.reverse()
        orig_len = len(res_list)
        # Remove files that have appeared in the folder in the meantime
        res_list = [r for r in res_list if r.split('/')[-1] not in files]
        print(f'After re-checking, {len(res_list)} new files to download out of {orig_len} total files found')
    else:
        # Results file already exists: read it and re-check against the output folder
        with open(f"{output_dir}/MOD13Q1_results.txt", "r") as f:
            res_list = [r.rstrip() for r in f.readlines()]
        res_list.reverse()
        orig_len = len(res_list)
        # Remove those that are already in the folder
        res_list = [r for r in res_list if r.split('/')[-1] not in files]
        print(f'Found existing results file, {len(res_list)} new files to download out of {orig_len} total files found')
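    # Note that the results file, once written, is reused on every subsequent run;
    # delete MOD13Q1_results.txt from the output directory to force a fresh search.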
    # Final check on the remaining downloads
    print(f'Found {len(res_list)} new files to download')
    print(res_list[:5])
    if len(res_list) > 0:
        print('downloading new files...')
        # Download all remaining files into the output directory;
        # earthaccess.download returns the local paths of the downloaded granules
        files = earthaccess.download(res_list, output_dir, threads=16)
    print("complete")
if __name__ == "__main__":
    # Get the Hammond path, output directory and mode from the command line, or fall back to defaults
    hammond_path = sys.argv[1] if len(sys.argv) > 1 else "./data/raw/global_tree_mortality_database/GTM_full_database_resolve_biomes_2017.csv"
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "./data/raw/MOD13Q1"
    test_mode = sys.argv[3] if len(sys.argv) > 3 else "test"  # "test" or "full"
    # Run the script
    main(hammond_path, output_dir, test_mode)