# YOLO-Object-Detection / yolo_loss.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class YoloLoss(nn.Module):
    """Sum-of-squares detection loss in the style of YOLO v1.

    Every grid cell carries ``B`` boxes laid out as ``[x, y, w, h, c]``
    followed by the class scores, i.e. ``B * 5 + num_classes`` values per cell.

    Args:
        S: grid size used to normalise box centres (``x / S``, ``y / S``).
        B: number of boxes stored per grid cell.
        l_coord: weight applied to the box regression loss.
        l_noobj: weight applied to the confidence loss of empty cells.
    """

    def __init__(self,S,B,l_coord,l_noobj):
        # nn.Module must be initialised before the instance can be called.
        super(YoloLoss, self).__init__()
        self.S = S
        self.B = B
        self.l_coord = l_coord
        self.l_noobj = l_noobj

    def compute_iou(self, box1, box2):
        """Compute the pairwise intersection over union of two sets of boxes.

        Each box is given by its corners ``[x1, y1, x2, y2]``.

        Args:
            box1: (tensor) bounding boxes, sized [N,4].
            box2: (tensor) bounding boxes, sized [M,4].

        Returns:
            (tensor) iou, sized [N,M].
        """
        N = box1.size(0)
        M = box2.size(0)

        # Top-left corner of every pairwise intersection.
        lt = torch.max(
            box1[:, :2].unsqueeze(1).expand(N, M, 2),  # [N,2] -> [N,1,2] -> [N,M,2]
            box2[:, :2].unsqueeze(0).expand(N, M, 2),  # [M,2] -> [1,M,2] -> [N,M,2]
        )
        # Bottom-right corner of every pairwise intersection.
        rb = torch.min(
            box1[:, 2:4].unsqueeze(1).expand(N, M, 2),  # [N,2] -> [N,1,2] -> [N,M,2]
            box2[:, 2:4].unsqueeze(0).expand(N, M, 2),  # [M,2] -> [1,M,2] -> [N,M,2]
        )

        # Disjoint boxes give a negative extent; clip it so their overlap is 0.
        wh = (rb - lt).clamp(min=0)  # [N,M,2]
        inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

        area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])  # [N,]
        area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])  # [M,]
        area1 = area1.unsqueeze(1).expand_as(inter)  # [N,] -> [N,1] -> [N,M]
        area2 = area2.unsqueeze(0).expand_as(inter)  # [M,] -> [1,M] -> [N,M]

        iou = inter / (area1 + area2 - inter)
        return iou

    def get_class_prediction_loss(self, classes_pred, classes_target):
        """Return the summed squared error between class scores.

        Args:
            classes_pred: (tensor) predicted class scores.
            classes_target: (tensor) target class scores, same size.

        Returns:
            class_loss: scalar tensor.
        """
        class_loss = F.mse_loss(classes_pred, classes_target, reduction='sum')
        return class_loss

    def get_regression_loss(self, box_pred_response, box_target_response):
        """Return the summed squared error of box centres and sqrt-sizes.

        Args:
            box_pred_response: (tensor) size (-1, 5), rows are [x, y, w, h, c].
            box_target_response: (tensor) size (-1, 5), rows are [x, y, w, h, c].

        Returns:
            reg_loss: scalar tensor.
        """
        center_loss = F.mse_loss(
            box_pred_response[:, :2], box_target_response[:, :2], reduction='sum')
        # Sizes are compared through their square roots.
        # NOTE(review): torch.sqrt yields NaN for negative widths/heights, so
        # this presumably relies on the network emitting non-negative sizes.
        size_loss = F.mse_loss(
            torch.sqrt(box_pred_response[:, 2:4]),
            torch.sqrt(box_target_response[:, 2:4]),
            reduction='sum')
        reg_loss = center_loss + size_loss
        return reg_loss

    def get_contain_conf_loss(self, box_pred_response, box_target_response_iou):
        """Return the summed squared error between predicted confidence and IoU.

        Args:
            box_pred_response: (tensor) size (-1, 5); column 4 is the confidence.
            box_target_response_iou: (tensor) size (-1, 5); column 4 is the
                target confidence.

        Returns:
            contain_loss: scalar tensor.
        """
        contain_loss = F.mse_loss(
            box_pred_response[:, 4], box_target_response_iou[:, 4], reduction='sum')
        return contain_loss

    def get_no_object_loss(self, target_tensor, pred_tensor, no_object_mask):
        """Return the confidence loss of the cells selected by ``no_object_mask``.

        Only the ``B`` confidence entries (columns 4, 9, ...) of each selected
        cell contribute to the loss.

        Args:
            target_tensor: (tensor) size (batch_size, S, S, B*5 + classes).
            pred_tensor: (tensor) size (batch_size, S, S, B*5 + classes).
            no_object_mask: (tensor) size (batch_size, S, S, B*5 + classes),
                constant along the last dimension.

        Returns:
            no_object_loss: scalar tensor.
        """
        # The mask is uniform within a cell, so one channel is enough to pick
        # whole cells; this also keeps the result 2-D when nothing is selected.
        cell_mask = no_object_mask[..., 0].bool()
        no_object_prediction = pred_tensor[cell_mask]  # [K, B*5 + classes]
        no_object_target = target_tensor[cell_mask]  # [K, B*5 + classes]

        confidence_columns = [i * 5 + 4 for i in range(self.B)]
        no_object_prediction_c = no_object_prediction[:, confidence_columns]
        no_object_target_c = no_object_target[:, confidence_columns]
        no_object_loss = F.mse_loss(
            no_object_prediction_c, no_object_target_c, reduction='sum')
        return no_object_loss

    def _xywh_to_xyxy(self, boxes):
        """Convert rows of [x, y, w, h, ...] into corner form [x1, y1, x2, y2]."""
        center = boxes[:, :2] / float(self.S)
        half_size = 0.5 * boxes[:, 2:4]
        return torch.cat((center - half_size, center + half_size), dim=1)

    def find_best_iou_boxes(self, box_target, box_pred):
        """Pick, for each grid cell, the predicted box that best fits the target.

        Rows are grouped in consecutive runs of ``B`` boxes belonging to one
        cell. The predictions of a cell are compared with the first target row
        of that cell (assumes the target repeats the same box for all B slots
        -- TODO confirm against the dataset encoding).

        Args:
            box_target: (tensor) size (-1, 5), rows are [x, y, w, h, c].
            box_pred: (tensor) size (-1, 5), rows are [x, y, w, h, c].

        Returns:
            box_target_iou: (tensor) same size as box_target; zero everywhere
                except column 4 of the chosen rows, which holds the best IoU.
            contains_object_response_mask: (bool tensor) same size as
                box_target; True on every entry of the chosen rows.
        """
        # Allocate on the inputs' device so the loss runs on CPU and GPU alike.
        contains_object_response_mask = torch.zeros_like(box_target, dtype=torch.bool)
        box_target_iou = torch.zeros_like(box_target)

        # The IoU only serves as a regression target, so keep it out of autograd.
        with torch.no_grad():
            pred_xyxy = self._xywh_to_xyxy(box_pred)
            target_xyxy = self._xywh_to_xyxy(box_target)
            for start in range(0, box_target.size(0), self.B):
                iou = self.compute_iou(
                    pred_xyxy[start:start + self.B],
                    target_xyxy[start:start + 1])  # [B,1]
                max_iou, max_index = iou.max(0)
                best_row = start + int(max_index.item())
                contains_object_response_mask[best_row] = True
                box_target_iou[best_row, 4] = max_iou[0]

        return box_target_iou, contains_object_response_mask

    def forward(self, pred_tensor,target_tensor):
        """Compute the total loss, summed (not averaged) over the batch.

        Args:
            pred_tensor: (tensor) size (batchsize, S, S, B*5 + classes)
                where B - number of bounding boxes per grid cell
                      5 - bounding box values [x, y, w, h, c]
                          (x, y - centre, w - width, h - height,
                           c - confidence of having an object)
            target_tensor: (tensor) same size as pred_tensor.

        Returns:
            total_loss: scalar tensor equal to
                l_coord * regression + containing-confidence
                + l_noobj * no-object-confidence + class loss.
        """
        box_elements = self.B * 5

        # A cell holds an object when the confidence of its first target box
        # (channel 4 in the [x, y, w, h, c] layout) is positive.
        contains_object_cells = target_tensor[..., 4] > 0
        no_object_cells = target_tensor[..., 4] == 0
        no_object_mask = no_object_cells.unsqueeze(-1).expand_as(target_tensor)

        # Indexing with the per-cell mask keeps whole cells together and stays
        # well-defined when no cell is selected.
        contains_object_pred = pred_tensor[contains_object_cells]
        contains_object_target = target_tensor[contains_object_cells]
        num_boxes = contains_object_pred.size(0) * self.B

        classes_pred = contains_object_pred[:, box_elements:]
        classes_target = contains_object_target[:, box_elements:]
        bounding_box_pred = contains_object_pred[:, :box_elements].reshape(num_boxes, 5)
        bounding_box_target = contains_object_target[:, :box_elements].reshape(num_boxes, 5)

        no_object_loss = self.get_no_object_loss(target_tensor, pred_tensor, no_object_mask)

        box_target_iou, contains_object_response_mask = self.find_best_iou_boxes(
            bounding_box_target, bounding_box_pred)

        # The mask marks whole rows, so one column identifies the chosen boxes.
        response_rows = contains_object_response_mask[:, 0].bool()
        box_prediction_response = bounding_box_pred[response_rows]
        # Targets are constants of the optimisation; detach them from autograd.
        box_target_response = bounding_box_target[response_rows].detach()
        box_target_response_iou = box_target_iou[response_rows].detach()

        regression_loss = self.get_regression_loss(box_prediction_response, box_target_response)
        contain_loss = self.get_contain_conf_loss(box_prediction_response, box_target_response_iou)
        class_loss = self.get_class_prediction_loss(classes_pred, classes_target)

        total_loss = (self.l_coord * regression_loss + contain_loss
                      + self.l_noobj * no_object_loss + class_loss)
        return total_loss