In [1]:
from climsim_utils.data_utils import *

2024-06-24 09:51:18.574691: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-24 09:51:18.574721: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-24 09:51:18.576226: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-24 09:51:18.583996: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Instantiating class

The example below will save training data in both .h5 and .npy format. Adjust if you only need one format. Also adjust input_abbrev to the input data files you will use. We expanded the original '.mli.' input files to include additional features such as previous steps' information, and '.mlexpand.' was just an arbitrary name we used for the expanded input files.

The training script currently assumes that the training set is in .h5 format and the validation set is in .npy format. If you are only generating training data, it is fine to keep just save_h5=True in the block below.

In [7]:
# Locations of the grid description file and of the normalization directory.
grid_path = '/global/u2/z/zeyuanhu/nvidia_codes/Climsim_private/grid_info/ClimSim_low-res_grid-info.nc'
norm_path = '/global/u2/z/zeyuanhu/nvidia_codes/Climsim_private/preprocessing/normalizations/'

grid_info = xr.open_dataset(grid_path)

# Open the three input-statistics datasets in a fixed order: mean, max, min.
input_mean, input_max, input_min = (
    xr.open_dataset(norm_path + relative_file)
    for relative_file in (
        'inputs/input_mean_v5_pervar.nc',
        'inputs/input_max_v5_pervar.nc',
        'inputs/input_min_v5_pervar.nc',
    )
)
output_scale = xr.open_dataset(norm_path + 'outputs/output_scale_std_lowerthred_v5.nc')

# Build the preprocessing helper used by every cell below.
# input_abbrev / output_abbrev name the '.mlexpand.' input files and '.mlo.'
# output files; save_h5 and save_npy together request both output formats.
data = data_utils(
    grid_info=grid_info,
    input_mean=input_mean,
    input_max=input_max,
    input_min=input_min,
    output_scale=output_scale,
    input_abbrev='mlexpand',
    output_abbrev='mlo',
    normalize=False,
    save_h5=True,
    save_npy=True,
)

In [8]:
# set data path; this value stays in effect for the training and validation
# cells below and is only reassigned in the test-data cell
data.data_path = '/global/homes/z/zeyuanhu/scratch/hugging/E3SM-MMF_ne4/train/'

# set inputs and outputs to V5 subset
data.set_to_v5_vars()

In [9]:
# display the input variable names currently configured on `data`
data.input_vars

['state_t',
 'state_rh',
 'state_qn',
 'liq_partition',
 'state_u',
 'state_v',
 'state_t_dyn',
 'state_q0_dyn',
 'state_u_dyn',
 'tm_state_t_dyn',
 'tm_state_q0_dyn',
 'tm_state_u_dyn',
 'state_t_prvphy',
 'state_q0001_prvphy',
 'state_qn_prvphy',
 'state_u_prvphy',
 'tm_state_t_prvphy',
 'tm_state_q0001_prvphy',
 'tm_state_qn_prvphy',
 'tm_state_u_prvphy',
 'pbuf_ozone',
 'pbuf_CH4',
 'pbuf_N2O',
 'state_ps',
 'pbuf_SOLIN',
 'pbuf_LHFLX',
 'pbuf_SHFLX',
 'pbuf_TAUX',
 'pbuf_TAUY',
 'pbuf_COSZRS',
 'cam_in_ALDIF',
 'cam_in_ALDIR',
 'cam_in_ASDIF',
 'cam_in_ASDIR',
 'cam_in_LWUP',
 'cam_in_ICEFRAC',
 'cam_in_LANDFRAC',
 'cam_in_OCNFRAC',
 'cam_in_SNOWHICE',
 'cam_in_SNOWHLAND',
 'tm_state_ps',
 'tm_pbuf_SOLIN',
 'tm_pbuf_LHFLX',
 'tm_pbuf_SHFLX',
 'tm_pbuf_COSZRS',
 'clat',
 'slat',
 'icol']

### Create training data

Below is an example of creating the training data by combining the 7 years of ClimSim simulation data. A stride_sample of 1000 is used here as an example; in our actual work we used stride_sample=1. We could not fit the full 7-year dataset into memory without subsampling. If that is also the case for you, process only a subset of the data at a time by adjusting the regexps passed to the set_regexps method. We saved 14 separate input .h5 files: for each year, we saved two files by setting start_idx=0 or 1. We have a folder like v5_full, which includes 14 subfolders named '11', '12', '21', '22', ..., '71', '72', and each subfolder contains a train_input.h5 and train_target.h5. How you split the saved training data won't influence the training: the training script reads in all the samples and randomly selects from across all of them to form each batch.

In [11]:
# Build the training split. The calls below configure state on `data` and are
# run in this order: file patterns -> stride -> file list -> save.
# set regular expressions for selecting training data
data.set_regexps(data_split = 'train', 
                regexps = ['E3SM-MMF.mlexpand.000[1234567]-*-*-*.nc', # years 1 through 7
                        'E3SM-MMF.mlexpand.0008-01-*-*.nc']) # first month of year 8
# set temporal subsampling (1000 is only the example value used in this notebook)
data.set_stride_sample(data_split = 'train', stride_sample = 1000)
# create list of files to extract data from
# NOTE(review): start_idx is presumably the 0-or-1 offset described in the text above -- confirm
data.set_filelist(data_split = 'train', start_idx=0)
# write the training input/target files into save_path; `data` was created with
# save_h5=True and save_npy=True, and the directory listing at the end of the
# notebook shows both train_*.npy and train_*.h5 files
data.save_as_npy(data_split = 'train', save_path = '/global/homes/z/zeyuanhu/scratch/hugging/E3SM-MMF_ne4/preprocessing/v5_example/')

### Create validation data

In [13]:
# Build the validation split with the same call sequence as the training cell.
# set regular expressions for selecting validation data
data.set_regexps(data_split = 'val',
                 regexps = ['E3SM-MMF.mlexpand.0008-0[23456789]-*-*.nc', # months 2 through 9 of year 8
                            'E3SM-MMF.mlexpand.0008-1[012]-*-*.nc', # months 10 through 12 of year 8
                            'E3SM-MMF.mlexpand.0009-01-*-*.nc']) # first month of year 9
# set temporal subsampling
# NOTE(review): the commented-out line keeps an alternative stride of 7; this example uses 700
# data.set_stride_sample(data_split = 'val', stride_sample = 7)
data.set_stride_sample(data_split = 'val', stride_sample = 700)
# create list of files to extract data from
data.set_filelist(data_split = 'val')
# write the validation input/target files into save_path (the final directory
# listing shows both val_*.npy and val_*.h5 files)
data.save_as_npy(data_split = 'val', save_path = '/global/homes/z/zeyuanhu/scratch/hugging/E3SM-MMF_ne4/preprocessing/v5_example/')

### Create test data

In [16]:
# Build the test split. The raw files are read from a different directory
# ('_test/') than the one used for the training and validation cells.
data.data_path = '/global/homes/z/zeyuanhu/scratch/hugging/E3SM-MMF_ne4/_test/'

# set inputs and outputs to V5 subset (same call as in the data-path cell near the top)
data.set_to_v5_vars()

# set regular expressions for selecting test data
# NOTE(review): the patterns start at 0009-03, so month 0009-02 is matched by
# neither the validation nor the test regexps -- confirm this gap is intentional
data.set_regexps(data_split = 'test',
                 regexps = ['E3SM-MMF.mlexpand.0009-0[3456789]-*-*.nc', # months 3 through 9 of year 9
                            'E3SM-MMF.mlexpand.0009-1[012]-*-*.nc', # months 10 through 12 of year 9
                            'E3SM-MMF.mlexpand.0010-*-*-*.nc', # all of year 10
                            'E3SM-MMF.mlexpand.0011-0[12]-*-*.nc']) # first two months of year 11
# set temporal subsampling
# NOTE(review): the commented-out line keeps an alternative stride of 7; this example uses 700
# data.set_stride_sample(data_split = 'test', stride_sample = 7)
data.set_stride_sample(data_split = 'test', stride_sample = 700)
# create list of files to extract data from
data.set_filelist(data_split = 'test')
# write the test input/target files into save_path (the final directory
# listing shows both test_*.npy and test_*.h5 files)
data.save_as_npy(data_split = 'test', save_path = '/global/homes/z/zeyuanhu/scratch/hugging/E3SM-MMF_ne4/preprocessing/v5_example/')

In [17]:
# list the output directory to confirm which train/val/test files were written
!ls /global/homes/z/zeyuanhu/scratch/hugging/E3SM-MMF_ne4/preprocessing/v5_example/

test_input.h5	test_target.npy  train_target.h5   val_input.npy
test_input.npy	train_input.h5	 train_target.npy  val_target.h5
test_target.h5	train_input.npy  val_input.h5	   val_target.npy
