import math
import torch
import numpy as np

from utils import CHART_TYPE_MAP, PAD_IDX
from .base import (
    BaseDataset,
    get_text_window,
    stack_dict,
    stack_tensor_dict,
    shift_right_vec,
    shift_tokens_right
)

'''
Offset_cont mode
predict ==> [x0, y0], [x1-x0, y1-y0], [x2-x1, y2-y1], ...
scale   ==> [mean(x), std(x), mean(y), std(y)] (log-scaled)
#Multi choice learners
'''


class PmcDataDataset(BaseDataset):
    def __init__(self, data, tokenizer=None, chart_tasks=None,
                 scale_mode='log10', scale_eps=(1.00001, 1.00001), scale_floor=(-1.0, -1.0),
                 window_size=16, widen_rate=0.0, max_widen_len=1, min_sent_len=10,
                 max_source_len=1024, max_target_len=256,
                 max_source_len2=8, max_target_len2=8,
                 pad_to_max_len=True, ignore_pad_token_for_loss=True,
                 norm_mode='minmax', discrete_input=None,
                 tasks=['categorical', 'series_name', 'axis', 'caption', 'data'],
                 sep_token='<SEP>', **kwargs):
        super().__init__(tokenizer, window_size, widen_rate, max_widen_len,
                         min_sent_len, max_source_len, max_target_len,
                         pad_to_max_len, ignore_pad_token_for_loss)

        # Filter the pmc dataset for certain tasks or charts
        self.chart_tasks = ['task6'] if chart_tasks is None else chart_tasks
        self.active_charts = []
        self.tokenizer = tokenizer
        self.max_source_len2 = max_source_len if max_source_len2 is None else max_source_len2
        self.max_target_len2 = max_target_len if max_target_len2 is None else max_target_len2
        # discrete_input lists which text sources ('captions', 'context') feed the encoder
        self.discrete_input = [] if discrete_input is None else discrete_input
        self.sep_token = sep_token
        self.tasks = tasks
        self._data = data
        self.data = self.filter_data(data)

        assert scale_mode in ['log10', 'log'], "scale mode not implemented"
        self.scale_mode = math.log10 if scale_mode == 'log10' else math.log
        # eps/floor are indexed as pairs below: [0] for the min/mean term, [1] for the range/std term
        self.scale_eps = scale_eps
        self.scale_floor = scale_floor
        self.box_plot_keys = ['min', 'first_quartile', 'median', 'third_quartile', 'max']
        self.node_map = {'pad': PAD_IDX, 'point': 1, 'eos': 0}
        self.del_list = []

        assert norm_mode in ['minmax', 'offset']
        self.norm_mode = norm_mode

    def filter_data(self, data):
        task_req = []
        if isinstance(self.chart_tasks, list):
            assert 'task6' in self.chart_tasks
            task_req = self.chart_tasks
        data = [d for d in data if all(t in d and d[t] is not None for t in task_req)]
        if len(self.active_charts):
            data = [d for d in data
                    if d['task1']['output']['chart_type'].lower().strip() in self.active_charts]
        return data

    def __len__(self):
        return len(self.data)

    def get_data_with_idx(self, index):
        if isinstance(index, list):
            return [self.data[i % len(self.data)] for i in index]
        elif isinstance(index, int):
            return self.data[index % len(self.data)]
        else:
            raise ValueError("Invalid index given: {}".format(index))

    def __getitem__(self, index):
        while True:
            d = self.data[index % len(self.data)]

            # Get context
            fig_id = d['fig_id']
            context_start = d['fig_index'][fig_id][0]
            all_text = d['all_text']
            captions = d['captions']

            # Remove caption from text
            all_text = self.preprocess_all_text(all_text)
            caption_label, all_text, context_start = self.get_caption_label(
                captions, all_text, context_start)
            context = get_text_window(
                all_text, context_start,
                tgt_token=self.tgt_token,
                window_size=self.window_size,
                widen_rate=self.widen_rate,
                max_widen_len=self.max_widen_len,
                min_sent_len=self.min_sent_len)

            ######################
            # Prepare inputs
            ######################
            input_text = ''
            if 'captions' in self.discrete_input:
                input_text += caption_label
            if 'context' in self.discrete_input:
                input_text += context

            outputs = {}
            context_inputs = None
            if self.tokenizer is not None:
                context_inputs, _ = self._tokenize(
                    self.tokenizer, input_text,
                    max_source_len=self.max_source_len,
                    max_target_len=self.max_target_len,
                    is_target=False)

            outputs['context'] = context_inputs
            outputs['chart_type'] = self.get_chart_type(d)
            outputs['chart_data'] = self.preprocess_data_series(d)

            # Keep scanning forward until a sample is consistent with its chart type
            if self.enforce_data_to_chart_type(outputs):
                break
            else:
                index += 1

        return index, outputs
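    # A rough sketch (not from the original source) of the per-sample dict that
    # __getitem__ returns, inferred from the code above; the exact tensors inside
    # 'context' depend on BaseDataset._tokenize and are an assumption here:
    #
    #   index, outputs = dataset[i]
    #   outputs == {
    #       'context':    tokenizer output for the caption/context text (None if no tokenizer),
    #       'chart_type': int index into CHART_TYPE_MAP,
    #       'chart_data': dict produced by preprocess_data_series(), see below,
    #   }
    #
    # The returned index may be larger than the requested one because the while
    # loop above skips samples rejected by enforce_data_to_chart_type.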
    def collate_fn(self, list_batch):
        list_idx = [b[0] for b in list_batch]
        list_batch = [b[1] for b in list_batch]
        collated_batch = stack_dict(list_batch)
        if self.tokenizer is not None:
            collated_batch['context'] = self.batch_tokens(collated_batch['context'])
        collated_batch['chart_data'] = self.collate_data(collated_batch['chart_data'])
        return list_idx, collated_batch

    def batch_tokens(self, list_batch):
        dict_of_tokens = stack_dict(list_batch)
        for key in list(dict_of_tokens.keys()):
            dict_of_tokens[key] = torch.cat(dict_of_tokens[key], dim=0)
        return dict_of_tokens

    def get_chart_type(self, d):
        '''Returns an index'''
        ct = d['task1']['output']['chart_type'].lower().strip()
        return CHART_TYPE_MAP.index(ct)

    def preprocess_data_series(self, d):
        output = {}
        output['unnorm_series'] = []
        output['norm_series'] = []
        output['unnorm_scale'] = {}
        output['chart_type'] = {}
        output['data_type'] = {}

        chart_type = d['task1']['output']['chart_type'].lower().strip()
        output['chart_type'] = chart_type

        # For normalising the data
        for s in ['y', 'x']:
            output['unnorm_scale'][s] = {}
            if self.norm_mode == 'offset':
                output['unnorm_scale'][s]['all'] = []  # Used to compute mean and std
            elif self.norm_mode == 'minmax':
                output['unnorm_scale'][s]['min'] = [float('inf')] * len(d['task6']['output']['data series'])
                output['unnorm_scale'][s]['max'] = [-float('inf')] * len(d['task6']['output']['data series'])
            else:
                raise NotImplementedError()

        # First loop: collect data from the json. Data is organised for scaling later.
        first_v = {}
        # This goes through the series
        for ds_idx, series_data in enumerate(d['task6']['output']['data series']):
            unnorm_series = {}
            unnorm_series['name'] = None
            unnorm_series['data'] = []
            if 'unnamed' not in series_data['name']:
                unnorm_series['name'] = series_data['name']

            prev_v = {}
            # This goes through each data point
            for pt_idx, data in enumerate(series_data['data']):
                series_keys = list(data.keys())

                # Ignore points that do not have 'x' and one of 'y' / 'y2'
                if ('x' not in series_keys) or \
                        ('y' not in series_keys and 'y2' not in series_keys):
                    continue

                # Replace y2 with y where possible
                y2_replacement_flag = False
                if 'y2' in series_keys and 'y' not in series_keys and 'x' in series_keys:
                    y2_replacement_flag = True

                pt_store = {}
                # This goes through each property of each data point
                for _, (k, v) in enumerate(data.items()):
                    # Replace y2 with y where possible
                    if 'y2-' in k:
                        continue
                    if y2_replacement_flag and k == 'y2':
                        k = 'y'

                    data_type = type(v)
                    output['data_type'][k] = data_type

                    # Remove newlines from text values
                    if data_type == str:
                        pt_store[k] = v.replace('\n', ' ')
                    # Store the data AND update max, min for each series
                    elif isinstance(v, (float, int)):
                        # Assign keys to each grouping
                        if k in ['y', 'y2', 'first_quartile', 'min', 'max', 'third_quartile', 'median']:
                            scale_key = 'y'
                        elif 'x' == k:
                            scale_key = 'x'
                        else:
                            raise ValueError("Unsupported axis key: {}".format(k))

                        # Use offset mode (mean, std) or minmax mode (min, range)
                        if self.norm_mode == 'minmax':
                            if v < -self.scale_eps[0]:
                                v = 0
                            pt_store[k] = v
                            output['unnorm_scale'][scale_key]['min'][ds_idx] = min(
                                output['unnorm_scale'][scale_key]['min'][ds_idx], v)
                            output['unnorm_scale'][scale_key]['max'][ds_idx] = max(
                                output['unnorm_scale'][scale_key]['max'][ds_idx], v)
                        elif self.norm_mode == 'offset':
                            pt_store[k] = v
                            output['unnorm_scale'][scale_key]['all'].append(v)
                        else:
                            raise NotImplementedError(self.norm_mode)

                if len(pt_store) > 1:
                    unnorm_series['data'].append(pt_store)
                    keys = list(pt_store.keys())
                    assert 'y' in keys or 'y2' not in keys, \
                        "{}, {}, {}".format(series_keys, keys, y2_replacement_flag)

            output['unnorm_series'].append(unnorm_series)
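        # Worked example of the minmax scaling below (illustrative only; the real
        # eps/floor values come from the constructor). Assume scale_eps = (1.0, 1.0),
        # scale_floor = (-1.0, -1.0) and a y-series with raw values [2, 6, 10]:
        #   unnorm min = 2, unnorm max = 10, raw range = 8
        #   norm_scale['y']['min'][i]   = log10(2 + 1.0) ~= 0.477
        #   norm_scale['y']['range'][i] = log10(8 + 1.0) ~= 0.954
        #   (both are then floored at -1.0, which changes nothing here)
        # Each point is later normalised against the *unnormalised* min/range,
        # e.g. v = 6 -> (6 - 2) / 8 = 0.5.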
        ##########################################
        # Second loop: calculate statistics on the collected data and then
        # normalise the data, e.g. min/max or mean/std.
        output['norm_scale'] = {}

        # Compute the mean and std
        if self.norm_mode == 'offset':
            for s in list(output['unnorm_scale'].keys()):
                all_data = np.array(output['unnorm_scale'][s]['all'])
                if len(all_data):
                    d_mean = np.mean(all_data)
                    d_std = np.std(all_data)
                    # Clip means lower than eps
                    d_mean = max(d_mean, -self.scale_eps[0] + 0.0001)
                    scale_mean = self.scale_mode(d_mean + self.scale_eps[0])
                    scale_std = self.scale_mode(d_std + self.scale_eps[1])
                    output['norm_scale'][s] = {
                        'mean': d_mean,
                        'std': d_std,
                        'scale_mean': scale_mean,
                        'scale_std': scale_std,
                    }
        elif self.norm_mode == 'minmax':
            for s in ['y', 'x']:
                series_count = len(output['unnorm_scale'][s]['min'])
                min_container, max_container = [], []
                for ds_idx in range(series_count):
                    if output['unnorm_scale'][s]['min'][ds_idx] < float('inf') and \
                            output['unnorm_scale'][s]['max'][ds_idx] > -float('inf'):
                        # and output['unnorm_scale'][s]['min'][ds_idx] != output['unnorm_scale'][s]['max'][ds_idx]:
                        if s not in output['norm_scale']:
                            output['norm_scale'][s] = {}
                            for n in ['min', 'max', 'range']:
                                output['norm_scale'][s][n] = [None] * series_count

                        output['norm_scale'][s]['min'][ds_idx] = self.scale_mode(
                            output['unnorm_scale'][s]['min'][ds_idx] + self.scale_eps[0])
                        output['norm_scale'][s]['max'][ds_idx] = self.scale_mode(
                            output['unnorm_scale'][s]['max'][ds_idx] + self.scale_eps[1])
                        min_container.append(output['unnorm_scale'][s]['min'][ds_idx])
                        max_container.append(output['unnorm_scale'][s]['max'][ds_idx])

                        # Calculate the scale before the floor to prevent out of domain errors
                        scale_range = output['unnorm_scale'][s]['max'][ds_idx] - output['unnorm_scale'][s]['min'][ds_idx]
                        output['norm_scale'][s]['range'][ds_idx] = self.scale_mode(scale_range + self.scale_eps[1])
                        output['norm_scale'][s]['min'][ds_idx] = max(output['norm_scale'][s]['min'][ds_idx], self.scale_floor[0])
                        output['norm_scale'][s]['range'][ds_idx] = max(output['norm_scale'][s]['range'][ds_idx], self.scale_floor[1])

                # Check if any entry is None, then just average the rest
                for ds_idx in range(series_count):
                    if s in output['norm_scale']:
                        if output['norm_scale'][s]['min'][ds_idx] is None:
                            avg_min = np.array(min_container).mean()
                            avg_max = np.array(max_container).mean()
                            avg_range = avg_max - avg_min
                            # output['norm_scale'][s]['max'][ds_idx] = self.scale_mode(avg_max + self.scale_eps[0])
                            output['norm_scale'][s]['min'][ds_idx] = max(
                                self.scale_mode(avg_min + self.scale_eps[0]), self.scale_floor[0])
                            output['norm_scale'][s]['range'][ds_idx] = max(
                                self.scale_mode(avg_range + self.scale_eps[1]), self.scale_floor[1])

        # Third loop: normalise every collected series with the statistics above
        for ds_idx, series in enumerate(output['unnorm_series']):
            norm_series = {}
            norm_series['name'] = series['name']
            norm_series['data'] = []
            for pt_idx, x_data in enumerate(series['data']):
                norm_x = {}
                for _, (k, v) in enumerate(x_data.items()):
                    if isinstance(v, (float, int)):
                        if k in ['y', 'y2', 'first_quartile', 'min', 'max', 'third_quartile', 'median']:
                            scale_key = 'y'
                        elif 'x' == k:
                            scale_key = 'x'
                        else:
                            raise NotImplementedError(k)

                        if self.norm_mode == 'minmax':
                            min_val = output['unnorm_scale'][scale_key]['min'][ds_idx]
                            minmax = output['unnorm_scale'][scale_key]['max'][ds_idx] - \
                                output['unnorm_scale'][scale_key]['min'][ds_idx]
                            v = (v - min_val) / minmax if minmax > 0 else 0
                        elif self.norm_mode == 'offset':
                            # if ds_idx == 0 and pt_idx == 0 and k in ['min', 'x', 'y', 'y2']:
                            #     v = 0.0
                            # else:
                            mean = output['norm_scale'][scale_key]['mean']
                            std = output['norm_scale'][scale_key]['std']
                            v = (v - mean) / std if std > 0 else 0.0
                    norm_x[k] = v
                if len(norm_x):
                    norm_series['data'].append(norm_x)
            if len(norm_series['data']):
                output['norm_series'].append(norm_series)

        return output
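    # Shape of the dict returned by preprocess_data_series (summarised from the code
    # above; value types depend on the annotation json, so treat this as a sketch
    # rather than a guarantee):
    #
    #   {
    #     'chart_type':    'line' | 'scatter' | 'vertical bar' | ... ,
    #     'data_type':     {'x': <python type>, 'y': <python type>, ...},
    #     'unnorm_series': [{'name': str or None, 'data': [{'x': ..., 'y': ...}, ...]}, ...],
    #     'unnorm_scale':  per-axis raw statistics ('min'/'max' lists, or 'all' in offset mode),
    #     'norm_scale':    per-axis log-scaled statistics used as regression targets,
    #     'norm_series':   same layout as 'unnorm_series' but with normalised numeric values,
    #   }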
    def enforce_data_to_chart_type(self, outputs):
        flag = False
        if len(outputs['chart_data']['norm_series']) == 0 or len(outputs['chart_data']['norm_series'][0]['data']) == 0:
            flag = False
        elif len(outputs['chart_data']['norm_scale']) == 0:
            flag = False
        elif outputs['chart_data']['chart_type'] == 'vertical box':
            flag = True
        elif outputs['chart_data']['chart_type'] in ['line', 'scatter'] and len(outputs['chart_data']['norm_scale']) == 2:
            flag = all(v == float for k, v in outputs['chart_data']['data_type'].items())
        elif outputs['chart_data']['chart_type'] in ['vertical bar', 'horizontal bar'] and len(outputs['chart_data']['norm_scale']) == 1:
            flag = True if outputs['chart_data']['data_type']['x'] == str else False
        else:
            flag = False
            # raise ValueError("chart type not recognized: {}".format(outputs['data']['chart_type']))
        return flag

    def collate_captions(self, list_batch):
        collated_batch = stack_dict(list_batch)
        for key in list(collated_batch.keys()):
            if isinstance(collated_batch[key], torch.Tensor):
                collated_batch[key] = torch.stack(collated_batch[key], dim=0)
        return collated_batch

    def tokenize_text_batch(self, padded_batch_txt_tgt, depth=3):
        all_txt_inputs, all_txt_labels = {}, {}
        for padded_series_txt_tgt in padded_batch_txt_tgt:
            length = len(padded_series_txt_tgt)
            collector = []
            for txt_raw in padded_series_txt_tgt:
                if depth == 2:
                    # Accumulate the whole list and tokenize it in one call on the last item
                    collector += [txt_raw]
                    if len(collector) < length:
                        continue
                    else:
                        txt_raw = collector
                txt_inputs, txt_labels = self._tokenize(
                    self.tokenizer, txt_raw,
                    max_source_len=self.max_source_len2,
                    max_target_len=self.max_target_len2,
                    is_target=True, shift_right=True)
                for k, v in txt_inputs.items():
                    if k not in all_txt_inputs:
                        all_txt_inputs[k] = []
                    all_txt_inputs[k] += [v]
                for k, v in txt_labels.items():
                    if k not in all_txt_labels:
                        all_txt_labels[k] = []
                    all_txt_labels[k] += [v]

        # all_txt_inputs = stack_tensor_dict(all_txt_inputs)
        # all_txt_labels = stack_tensor_dict(all_txt_labels)

        ################################################
        # Pad and batch
        # def pad_n_batch()
        label = all_txt_labels['input_ids']
        col_counts = [l.size(0) for l in label]
        max_token_len = label[0].size(1)
        padded_labels = []
        for lbl in label:
            pad_len = max(col_counts) - lbl.size(0)
            if pad_len > 0:
                pad = torch.ones((pad_len, max_token_len), dtype=torch.int32) * self.tokenizer.pad_token_id
                lbl = torch.cat([lbl, pad], dim=0)
            padded_labels.append(lbl)
        label = torch.stack(padded_labels, dim=0)
        label[label == self.tokenizer.pad_token_id] = PAD_IDX
        all_txt_labels['input_ids'] = label
        return all_txt_inputs, all_txt_labels
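    # A sketch of what tokenize_text_batch hands back, assuming BaseDataset._tokenize
    # returns an 'input_ids' / 'attention_mask' style dict (an assumption, not verified
    # here): all_txt_inputs collects one tensor per tokenize call in plain lists, while
    # all_txt_labels['input_ids'] is padded and stacked into a single
    # (num_calls, max_rows, max_token_len) tensor whose pad ids are remapped to PAD_IDX
    # so the loss ignores them. With depth=2 the per-series strings are gathered into a
    # single list and tokenized in one call per chart.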
    def collate_data(self, batch_data):
        batch_chart_type = []
        batch_node_type = []
        batch_node_mask = []
        batch_reg_targets = []
        batch_reg_mask = []
        batch_text_targets = []
        batch_text_mask = []
        max_node_len = 0
        max_series_len = 0
        batch_name_targets = []
        batch_name_mask = []
        batch_scale_tgt = []

        max_series_len = max(len(d['norm_series']) for d in batch_data)

        for data in batch_data:
            series_node_type = []
            series_node_mask = []
            series_reg_tgt = []
            series_reg_mask = []
            series_txt_tgt = []
            series_txt_mask = []
            series_name_tgt = []
            series_name_mask = []
            series_scale_tgt = []

            chart_type = data['chart_type']
            batch_chart_type.append(chart_type)

            for s_idx, series in enumerate(data['norm_series']):
                node_type, node_mask = [], []
                reg_targets, reg_mask = [], []
                txt_tgt, txt_mask = [], []

                # TARGET 1: Scales
                scale_flag = True
                scale_tgt = []
                for s in ['x', 'y']:
                    if s in data['norm_scale']:
                        if self.norm_mode == 'minmax':
                            scale_tensor = torch.tensor(
                                [data['norm_scale'][s]['min'][s_idx],
                                 data['norm_scale'][s]['range'][s_idx]],
                                dtype=torch.float32)
                        elif self.norm_mode == 'offset':
                            scale_tensor = torch.tensor(
                                [data['norm_scale'][s]['scale_mean'],
                                 data['norm_scale'][s]['scale_std']],
                                dtype=torch.float32)
                        scale_tgt += scale_tensor
                        if None in scale_tgt:
                            scale_flag = False
                            break

                # Check scales are correct. If not, remove the series
                if not scale_flag:
                    continue
                if len(scale_tgt):
                    scale_tgt = torch.stack(scale_tgt, dim=-1).view(1, -1)
                    series_scale_tgt += scale_tgt

                # TARGET 2: Series name
                if series['name'] is not None:
                    series_name_tgt.append(series['name'])
                    series_name_mask.append(1)
                else:
                    series_name_tgt.append('')
                    series_name_mask.append(0)

                # TARGET 3: Sequence of points (node type)
                prev_pt = None
                for pidx, point in enumerate(series['data']):
                    # TARGET 4: Regression data
                    if data['chart_type'] == 'vertical box':
                        # Original:   [min, first_quartile, median, third_quartile, max]
                        # Prediction: [min_val, first_to_min, median_to_min, third_to_first, max_to_third]
                        # The prediction head must end in a ReLU layer so the offsets stay non-negative.
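                        # Illustrative example (values invented for clarity): a box with
                        #   min=1, first_quartile=3, median=4, third_quartile=6, max=9
                        # is encoded as
                        #   reg_tgt = [1, 2, 3, 3, 3]
                        # i.e. the minimum followed by non-negative offsets from the
                        # reference points named above.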
                        min_val = point['min']
                        first_to_min = point['first_quartile'] - point['min']
                        median_to_min = point['median'] - point['min']
                        third_to_first = point['third_quartile'] - point['first_quartile']
                        max_to_third = point['max'] - point['third_quartile']
                        reg_tgt = [min_val, first_to_min, median_to_min, third_to_first, max_to_third]
                    elif data['chart_type'] in ['vertical bar', 'horizontal bar']:
                        if pidx == 0 or self.norm_mode == 'minmax':
                            reg_tgt = [point[k] for k in ['y']]
                        else:
                            reg_tgt = [point[k] - prev_pt[k] for k in ['y']]
                    elif data['chart_type'] in ['line', 'scatter']:
                        if pidx == 0 or self.norm_mode == 'minmax':
                            reg_tgt = [point[k] for k in ['x', 'y']]
                        else:
                            reg_tgt = [point[k] - prev_pt[k] for k in ['x', 'y']]
                    else:
                        raise NotImplementedError("Invalid chart given")

                    # Save for offsetting
                    prev_pt = point

                    node_type.append(self.node_map['point'])
                    node_mask.append(1)

                    # TARGET 5: TEXT data
                    # (check isinstance before len so non-string x values cannot raise)
                    if data['chart_type'] not in ['line', 'scatter'] and \
                            point.get('x') is not None and \
                            isinstance(point.get('x'), str) and \
                            len(point.get('x')):
                        txt_tgt.append(point['x'])
                        txt_mask.append(1)
                    else:
                        txt_tgt.append('')
                        txt_mask.append(0)

                    reg_targets.append(reg_tgt)
                    reg_mask.append(1)

                node_type.append(self.node_map['eos'])
                node_mask.append(1)
                reg_len = len(reg_targets[-1])
                reg_targets.append([0.] * reg_len)
                reg_mask.append(0)

                series_node_type += [node_type]
                series_node_mask += [node_mask]
                series_reg_tgt += [reg_targets]
                series_reg_mask += [reg_mask]
                series_txt_tgt += [txt_tgt]
                series_txt_mask += [txt_mask]

                # Ensure all are the same length
                cur_node_len = len(node_type)
                for idx, l in enumerate([node_type, node_mask, reg_targets, reg_mask]):
                    assert len(l) == cur_node_len, "l={} idx={} cur_node_len={}".format(l, idx, cur_node_len)
                if cur_node_len > max_node_len:
                    max_node_len = cur_node_len

            batch_node_type += [series_node_type]
            batch_node_mask += [series_node_mask]
            batch_reg_targets += [series_reg_tgt]
            batch_reg_mask += [series_reg_mask]
            batch_text_targets += [series_txt_tgt]
            batch_text_mask += [series_txt_mask]
            batch_name_targets += [series_name_tgt]
            batch_name_mask += [series_name_mask]

            # Stacks by series
            if self.norm_mode == 'minmax':
                batch_scale_tgt += [torch.stack(series_scale_tgt, dim=0)]
            elif self.norm_mode == 'offset':
                batch_scale_tgt += [series_scale_tgt[0]]

        padded_batch_node_type = []
        padded_batch_node_mask = []
        padded_batch_reg_tgt = []
        padded_batch_reg_mask = []
        padded_batch_txt_tgt = []
        padded_batch_scale_tgt = []
        padded_batch_scale_mask = []

        # Padding to ensure the whole batch is the same length
        for series_node_type, series_node_mask, series_reg_tgt, series_reg_mask, \
                series_txt_tgt, series_txt_mask, series_scale_tgt in zip(
                    batch_node_type, batch_node_mask, batch_reg_targets, batch_reg_mask,
                    batch_text_targets, batch_text_mask, batch_scale_tgt):
            padded_series_node_type = []
            padded_series_node_mask = []
            padded_series_reg_tgt = []
            padded_series_reg_mask = []
            padded_series_txt_tgt = []
            padded_series_txt_mask = []
            # max_token_len = 0

            ### Pad by column
            for idx, (node_type, node_mask, reg_tgt, reg_mask, txt_tgt, txt_mask) in enumerate(zip(
                    series_node_type, series_node_mask, series_reg_tgt,
                    series_reg_mask, series_txt_tgt, series_txt_mask)):
                pad_node_len = max_node_len - len(node_type)
                assert sum(reg_mask) > 0, "must be more than zero"
                if pad_node_len > 0:
                    node_type += [self.node_map['pad']] * pad_node_len
                    node_mask += [0] * pad_node_len
                    mask_pad = [0] * pad_node_len
                    reg_mask += mask_pad
                    txt_mask += mask_pad
                    reg_len = len(reg_tgt[-1])
                    reg_pad = [[0.] * reg_len for _ in range(pad_node_len)]
                    reg_tgt += reg_pad

                padded_series_node_type += [node_type]
                padded_series_node_mask += [node_mask]
                padded_series_reg_tgt += [reg_tgt]
                padded_series_reg_mask += [reg_mask]
                if idx == 0:
                    padded_series_txt_tgt += [txt_tgt]
                    padded_series_txt_mask += [txt_mask]
                    # Categorical text data is always the same, except the series name
                    # txt_tgt = ['text1', 'text2', '', '']
                    # txt_tgt_ids = [[2,3,4,0], [1,2,0,0], [1,0,0,0], [1,0,0,0]]
                    # max_token_len = max(max_token_len, len(txt_tgt_ids[0]))

            pad_series_len = max_series_len - len(padded_series_node_type)
            pad_node_len = max(len(p) for p in padded_series_node_type)
            reg_len = len(padded_series_reg_tgt[0][0])
            padded_series_reg_mask = torch.tensor(padded_series_reg_mask, dtype=torch.int32)
            padded_series_node_mask = torch.tensor(padded_series_node_mask, dtype=torch.int32)
            assert padded_series_reg_mask.sum() > 0, "mask is zero"
            for _ in range(pad_series_len):
                padded_series_node_type += [[self.node_map['eos']] + [self.node_map['pad']] * (pad_node_len - 1)]
                reg_pad = [[0.] * reg_len for _ in range(pad_node_len)]
                padded_series_reg_tgt += [reg_pad]
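            # At this point (per chart): padded_series_node_type holds max_series_len
            # rows of max_node_len node ids, and padded_series_reg_tgt mirrors it with
            # reg_len values per node. Real entries carry mask 1, padded nodes and
            # padded series carry mask 0; the mask rows for the padded series are
            # appended just below.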
            if pad_series_len > 0 and pad_node_len > 0:
                pad_mask = torch.zeros((pad_series_len, pad_node_len), dtype=torch.long)
                padded_series_reg_mask = torch.cat([padded_series_reg_mask, pad_mask], dim=0)
                padded_series_node_mask = torch.cat([padded_series_node_mask, pad_mask], dim=0)

            #####################################
            # SCALE PADDING
            # Pad the scales and make a mask
            series_scale_mask = None
            if self.norm_mode == 'minmax':
                series_len, scale_dim = series_scale_tgt.shape
                series_scale_mask = torch.ones((series_len), dtype=torch.int32)
                pad_len = max_series_len - series_len
                if pad_len > 0:
                    # Keep the pad dtype consistent with the float32 scale targets
                    pad_scale_tgt = torch.zeros((pad_len, scale_dim), dtype=torch.float32)
                    series_scale_tgt = torch.cat([series_scale_tgt, pad_scale_tgt], dim=0)
                    pad_scale_mask = torch.zeros((pad_len), dtype=torch.int32)
                    series_scale_mask = torch.cat([series_scale_mask, pad_scale_mask], dim=0)

            padded_series_reg_tgt = torch.tensor(padded_series_reg_tgt, dtype=torch.float32)

            padded_batch_node_type += [padded_series_node_type]
            padded_batch_node_mask += [padded_series_node_mask]
            padded_batch_reg_tgt += [padded_series_reg_tgt]
            padded_batch_reg_mask += [padded_series_reg_mask]
            padded_batch_txt_tgt += [padded_series_txt_tgt]
            padded_batch_scale_tgt += [series_scale_tgt]
            padded_batch_scale_mask += [series_scale_mask]

        # Shift right all sequence-to-sequence problems (text, name)
        if self.tokenizer is not None:
            txt_inputs, txt_labels = self.tokenize_text_batch(padded_batch_txt_tgt)
            name_inputs, name_labels = self.tokenize_text_batch(batch_name_targets, depth=2)

        # Stack non-sequence problems
        inputs = {}
        inputs['chart_type'] = batch_chart_type

        #### Continuous data (can be a different dimension depending on chart type; cannot stack these.)
        inputs['scale'] = {}
        inputs['scale']['inputs_embeds'] = padded_batch_scale_tgt  # torch.stack(batch_scale_tgt, dim=0)
        inputs['scale']['attention_mask'] = padded_batch_scale_mask
        if self.norm_mode == 'minmax':
            inputs['scale']['decoder_inputs_embeds'] = [
                shift_right_vec(inp, start_values=0.0) for inp in inputs['scale']['inputs_embeds']]
            inputs['scale']['decoder_attention_mask'] = [
                shift_right_vec(inp, start_values=0.0) for inp in inputs['scale']['attention_mask']]

        inputs['continuous'] = {}
        inputs['continuous']['inputs_embeds'] = padded_batch_reg_tgt  # torch.stack(padded_batch_reg_tgt, dim=0)
        attn_mask = torch.stack(padded_batch_reg_mask, dim=0)
        assert attn_mask.sum() > 0, "must be at least one reg target: {}".format(attn_mask)
        inputs['continuous']['attention_mask'] = attn_mask
        inputs['continuous']['decoder_inputs_embeds'] = [
            shift_right_vec(inp, start_values=0.0) for inp in inputs['continuous']['inputs_embeds']]
        inputs['continuous']['decoder_attention_mask'] = [
            shift_right_vec(inp, start_values=0.0) for inp in inputs['continuous']['attention_mask']]

        #### Discrete data
        if self.tokenizer is not None:
            inputs['categorical'] = txt_inputs
            inputs['categorical']['label'] = txt_labels
            inputs['categorical']['raw'] = padded_batch_txt_tgt
            inputs['series_name'] = name_inputs
            inputs['series_name']['label'] = name_labels
            inputs['series_name']['raw'] = batch_name_targets

        inputs['node'] = {}
        inputs['node']['input_ids'] = torch.tensor(padded_batch_node_type, dtype=torch.int32)
        inputs['node']['attention_mask'] = torch.stack(padded_batch_node_mask, dim=0)  # torch.tensor(padded_batch_node_mask, dtype=torch.long)
        # inputs['node']['decoder_input_ids'] = torch.stack(
        #     [shift_tokens_right(inp, self.node_map['pad'], dim=-1)
        #      for inp in inputs['node']['input_ids']], dim=0)
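        # The node-type ids form a (batch, max_series_len, max_node_len) tensor.
        # The two slices below therefore expose, per chart, the node sequence of the
        # first series ('col') and the first node of every series ('row'); presumably
        # these act as targets for predicting the number of points and the number of
        # series, though that interpretation is not stated in this file.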
        inputs['labels'] = {}
        inputs['labels']['col'] = inputs['node']['input_ids'][:, 0, :]
        inputs['labels']['row'] = inputs['node']['input_ids'][:, :, 0]
        return inputs