
Encoded and decoded versions of bounding-box regression offsets are different

I am trying to replicate the bounding-box regression technique used in Fast R-CNN as given here. I wrote a decode function and an encode function. Ideally, when a bounding box is passed to the encoder and then decoded, I should get the same bounding box back.

These are my input bounding boxes:

import numpy as np
import tensorflow as tf
from itertools import product
from math import sqrt

def make_anchors(img_size,conv_h,conv_w,scale,aspect_ratios):
    prior_data = []
    # Iteration order is important (it has to sync up with the convout)
    for j,i in product(range(conv_h),range(conv_w)):
        # + 0.5 because priors are in center
        x = (i + 0.5) / conv_w
        y = (j + 0.5) / conv_h

        for ar in aspect_ratios:
            ar = sqrt(ar)
            w = scale * ar / img_size
            h = scale / ar / img_size
            
            prior_data += [x,y,w,h]

    return prior_data

test_bBox = tf.convert_to_tensor((np.array([[204.044,253.8351,487.8226,427.06363],
                                            [0,140.01741,550,290.21936],
                                            [40.005028,117.37102,255.7913,205.13097],
                                            [263.31314,67.0434,514.04736,124.48139],
                                            [0,503.79834,487.0279,550]])),dtype=tf.float32)  # first value of the last box was lost in the original; 0 assumed

test_labels = tf.convert_to_tensor((np.array([[1],[2],[3],[4],[5]])),dtype=tf.float32)

feature_map_size=[[69,69],[35,35],[18,18],[9,9],[5,5]]
aspect_ratios=[1,0.5,2]
scales=[24,48,96,192,384]
anchors = []

for i,shape in enumerate(feature_map_size):
    anchors += make_anchors(550,shape[0],shape[1],scales[i],aspect_ratios)
    
anchors = tf.reshape(tf.convert_to_tensor(anchors),[-1,4])

I use an input image size of 550x550 and computed the feature-map sizes accordingly.
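
For reference, these feature-map sizes are consistent with taking the ceiling of the input size divided by a typical set of strides; a minimal sketch, assuming strides of 8, 16, 32, 64 and 128 for the five levels (the stride values are my assumption, they are not stated above):

from math import ceil

img_size = 550
strides = [8,16,32,64,128]  # assumed strides for the five feature-map levels
print([[ceil(img_size / s)] * 2 for s in strides])
# [[69,69],[35,35],[18,18],[9,9],[5,5]]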

The encoding is done as follows:

def encode(map_loc,center_anchors,include_variances=False):
    # center_gt = tf.map_fn(lambda x: map_to_center_form(x),map_loc)
    h = map_loc[:,2] - map_loc[:,0]
    w = map_loc[:,3] - map_loc[:,1]
    center_gt = tf.cast(tf.stack([map_loc[:,1] + (w / 2),map_loc[:,0] + (h / 2),w,h],axis=-1),tf.float32)
    variances = [0.1,0.2]

    # calculate offset
    if include_variances:
        g_hat_cx = (center_gt[:,0] - center_anchors[:,0]) / center_anchors[:,2] / variances[0]
        g_hat_cy = (center_gt[:,1] - center_anchors[:,1]) / center_anchors[:,3] / variances[0]
    else:
        g_hat_cx = (center_gt[:,0] - center_anchors[:,0]) / center_anchors[:,2]
        g_hat_cy = (center_gt[:,1] - center_anchors[:,1]) / center_anchors[:,3]
    tf.debugging.assert_non_negative(center_anchors[:,2] / center_gt[:,2])
    tf.debugging.assert_non_negative(center_anchors[:,3] / center_gt[:,3])
    if include_variances:
        g_hat_w = tf.math.log(center_gt[:,2] / center_anchors[:,2]) / variances[1]
        g_hat_h = tf.math.log(center_gt[:,3] / center_anchors[:,3]) / variances[1]
    else:
        g_hat_w = tf.math.log(center_gt[:,2] / center_anchors[:,2])
        g_hat_h = tf.math.log(center_gt[:,3] / center_anchors[:,3])
    offsets = tf.stack([g_hat_cx,g_hat_cy,g_hat_w,g_hat_h],axis=-1)
    return offsets
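
As a quick sanity check of encode (the numbers below are made up for illustration): if a ground-truth box has exactly the same center form as its anchor, all four offsets should come out as zero, since the center differences are zero and log(1) = 0 for the size terms.

# anchor in [c_x,c_y,w,h] form and a gt box in [y_min,x_min,y_max,x_max] form
# describing the same region; the encoded offsets should be ~[[0.,0.,0.,0.]]
anchor = tf.constant([[0.5,0.5,0.2,0.4]])
gt_box = tf.constant([[0.3,0.4,0.7,0.6]])
print(encode(gt_box,anchor).numpy())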

def area(Boxlist,scope=None):
    # https://github.com/tensorflow/models/blob/831281cedfc8a4a0ad7c0c37173963fafb99da37/official/vision/detection/utils/object_detection/Box_list_ops.py#L48
    """Computes area of Boxes.
    Args:
    Boxlist: BoxList holding N Boxes
    scope: name scope.
    Returns:
    a tensor with shape [N] representing Box areas.
    """
    y_min,x_min,y_max,x_max = tf.split(
        value=Boxlist,num_or_size_splits=4,axis=1)
    return tf.squeeze((y_max - y_min) * (x_max - x_min),[1])

def intersection(Boxlist1,Boxlist2,scope=None):
    # https://github.com/tensorflow/models/blob/831281cedfc8a4a0ad7c0c37173963fafb99da37/official/vision/detection/utils/object_detection/Box_list_ops.py#L209
    """Compute pairwise intersection areas between Boxes.
    Args:
    Boxlist1: BoxList holding N Boxes
    Boxlist2: BoxList holding M Boxes
    scope: name scope.
    Returns:
    a tensor with shape [N,M] representing pairwise intersections
    """
    y_min1,x_min1,y_max1,x_max1 = tf.split(
        value=Boxlist1,num_or_size_splits=4,axis=1)
    y_min2,x_min2,y_max2,x_max2 = tf.split(
        value=Boxlist2,num_or_size_splits=4,axis=1)
    all_pairs_min_ymax = tf.minimum(y_max1,tf.transpose(y_max2))
    all_pairs_max_ymin = tf.maximum(y_min1,tf.transpose(y_min2))
    intersect_heights = tf.maximum(0.0,all_pairs_min_ymax - all_pairs_max_ymin)
    all_pairs_min_xmax = tf.minimum(x_max1,tf.transpose(x_max2))
    all_pairs_max_xmin = tf.maximum(x_min1,tf.transpose(x_min2))
    intersect_widths = tf.maximum(0.0,all_pairs_min_xmax - all_pairs_max_xmin)
    return intersect_heights * intersect_widths

def IoU(Boxlist1,Boxlist2,scope=None):
    # https://github.com/tensorflow/models/blob/831281cedfc8a4a0ad7c0c37173963fafb99da37/official/vision/detection/utils/object_detection/Box_list_ops.py#L259
    """Computes pairwise intersection-over-union between Box collections.
    Args:
    Boxlist1: BoxList holding N Boxes
    Boxlist2: BoxList holding M Boxes
    scope: name scope.
    Returns:
    a tensor with shape [N,M] representing pairwise IoU scores.
    """
    intersections = intersection(Boxlist1,Boxlist2)
    areas1 = area(Boxlist1)
    areas2 = area(Boxlist2)
    unions = (
        tf.expand_dims(areas1,1) + tf.expand_dims(areas2,0) - intersections)
    return tf.where(
        tf.equal(intersections,0.0),tf.zeros_like(intersections),tf.truediv(intersections,unions))
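
A small hand-checkable example of the IoU helper (the box values are mine, just for illustration): two 2x2 boxes offset by one unit overlap in a 1x1 square, so the IoU should be 1 / (4 + 4 - 1) = 1/7 ≈ 0.143.

box_a = tf.constant([[0.0,0.0,2.0,2.0]])  # [y_min,x_min,y_max,x_max]
box_b = tf.constant([[1.0,1.0,3.0,3.0]])
print(IoU(box_a,box_b).numpy())  # ~[[0.14285715]]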

def matching(pos_thresh,neg_thresh,gt_bBox,gt_labels,priors):
    pairwise_IoU = IoU(priors,gt_bBox) # size: [num_priors,num_objects]; priors along the rows and ground truths along the columns

    each_prior_max = tf.reduce_max(pairwise_IoU,axis=-1) # size [num_priors]; max IoU of each prior with any ground-truth box
    each_prior_index = tf.math.argmax(pairwise_IoU,axis=-1) # size [num_priors]; index of the ground-truth box with the max IoU for each prior

    each_Box_max = tf.reduce_max(pairwise_IoU,axis=0)
    each_Box_index = tf.math.argmax(pairwise_IoU,axis=0)

    # For the max IoU prior for each gt Box,set its IoU to 2. This ensures that it won't be filtered
    # in the threshold step even if the IoU is under the negative threshold. This is because that we want
    # at least one prior to match with each gt Box or else we'd be wasting training data.

    indices = tf.expand_dims(each_Box_index,axis=-1)

    updates = tf.cast(tf.tile(tf.constant([2]),each_Box_index.shape),dtype=tf.float32)
    each_prior_max = tf.tensor_scatter_nd_update(each_prior_max,indices,updates)

    # Set the index of the pair (prior,gt) we set the overlap for above.
    updates = tf.cast(tf.range(0,tf.size(each_Box_index)),dtype=tf.int64)
    each_prior_index = tf.tensor_scatter_nd_update(each_prior_index,indices,updates)

    each_prior_Box = tf.gather(gt_bBox,each_prior_index) # size: [num_priors,4]
    conf = tf.squeeze(tf.gather(gt_labels,each_prior_index) + 1) # the class of the max IoU gt Box for each prior,size: [num_priors]


    neutral_label_index = tf.where(each_prior_max < pos_thresh)
    background_label_index = tf.where(each_prior_max < neg_thresh)

    conf = tf.tensor_scatter_nd_update(conf,neutral_label_index,-1*tf.ones(tf.size(neutral_label_index)))
    conf = tf.tensor_scatter_nd_update(conf,background_label_index,tf.zeros(tf.size(background_label_index)))

    offsets = encode(each_prior_Box,priors)

    return offsets,conf,each_prior_Box,each_prior_index


offsets,conf,each_prior_Box,each_prior_index = \
    matching(0.5,0.5,test_bBox/550,test_labels,anchors)  # neg_thresh value was lost in the original; 0.5 assumed

If I redraw the boxes recovered from the offsets obtained after encoding, I get an image like this:


def _decode(Box_p,priors,include_variances=False):
    # https://github.com/feiyuhuahuo/Yolact_minimal/blob/9299a0cf346e455d672fadd796ac748871ba85e4/utils/Box_utils.py#L151
    """
    Decode predicted bBox coordinates using the scheme
    employed at https://lilianweng.github.io/lil-log/2017/12/31/object-recognition-for-dummies-part-3.html
        b_x = prior_w*loc_x + prior_x
        b_y = prior_h*loc_y + prior_y
        b_w = prior_w * exp(loc_w)
        b_h = prior_h * exp(loc_h)
    
    Note that loc is input as [c_x,c_y,w,h]
    while priors are input as [c_x,c_y,w,h] where each coordinate
    is relative to size of the image.
    
    Also note that prior_x and prior_y are center coordinates.
    """
    variances = [0.1,0.2]
    Box_p = tf.cast(Box_p,tf.float32)
    priors = tf.cast(priors,tf.float32)
    if include_variances:
        b_x_y = priors[:,:2] + Box_p[:,:2] * priors[:,2:]* variances[0]
        b_w_h = priors[:,2:] * tf.math.exp(Box_p[:,2:]* variances[1])
    else:
        b_x_y = priors[:,:2] + Box_p[:,:2] * priors[:,2:]
        b_w_h = priors[:,2:] * tf.math.exp(Box_p[:,2:])
    
    Boxes = tf.concat([b_x_y,b_w_h],axis=1)
    
    # [x_min,y_min,x_max,y_max]
    Boxes = tf.concat([Boxes[:,:2] - Boxes[:,2:] / 2,Boxes[:,2:] + Boxes[:,:2]],axis=1)
    
    # [y_min,x_min,y_max,x_max]
    return tf.transpose(tf.stack([Boxes[:,1],Boxes[:,0],Boxes[:,3],Boxes[:,2]]))


import cv2
import matplotlib.pyplot as plt

_idx = tf.where(conf > 0.5)
_test = _decode(offsets,anchors)
_out = tf.squeeze(tf.gather(_test,_idx)).numpy()*550
img_test = 255*np.ones((1000,1000,3),dtype=np.uint8)

for Box in _out:
    Box = np.round(Box).astype(int)
    image = cv2.rectangle(img_test,(Box[1],Box[0]),(Box[3],Box[2]),(0,255,0),2)
plt.imshow(image)

[image: the decoded boxes drawn on a blank canvas; several extend past the 550-pixel input size]

As you can see, the output boxes extend beyond the 550-pixel input image size. Why does this happen?

Solution

The problem was in my decode function when computing [x_min,y_min,x_max,y_max]: the max corner was computed as center plus the full width/height instead of center plus half the width/height, which is why the boxes overshoot the image. It should be:

# [x_min,y_min,x_max,y_max]
    boxes = tf.concat([boxes[:,:2] - boxes[:,2:] / 2,boxes[:,2:] / 2 + boxes[:,:2]],axis=1)
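
With that fix applied to _decode, a minimal round-trip check (reusing the offsets, anchors and each_prior_Box produced by the matching call above) should recover the encoded boxes up to floating-point error:

decoded = _decode(offsets,anchors)
# every prior's decoded box should match the ground-truth box it was encoded against
print(tf.reduce_max(tf.abs(decoded - each_prior_Box)).numpy())  # expected to be ~0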
