Mask R-CNN在源码中添加ROI Align层

RoI ( Region of Interest ), 即为原图中的Region Proposals映射到特征图( feature map )上的框。Fast R-CNN中首次提出的ROI Pooling，由于在下采样过程中两次取整量化的操作，使得RoI有些许的位置偏差。因而在Mask R-CNN中提出了ROI Align，通过使用双线性插值计算小数坐标位置的值，结果更加精确。

添加文件

需要添加的文件目录

./faster-mask/caffe

	    |--./include/caffe/layers/roi_align_layer.hpp

	    |--./src/caffe/layers/roi_align_layer.cpp

	    |--./src/caffe/layers/roi_align_layer.cu

	    |--./src/caffe/test/test_roi_align_layer.cpp

头文件

./caffe目录下，include文件夹下放的都是层的.hpp头文件，前缀名与src中层相对应
头文件要包含定义层中使用的类变量和类型，声明使用的函数

roi_align_layer.hpp

#ifndef CAFFE_ROI_POOLING_LAYER_HPP_
#define CAFFE_ROI_POOLING_LAYER_HPP_
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {
template <typename Dtype>
class ROIAlignLayer : public Layer<Dtype> {
 public:
  explicit ROIAlignLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "ROIAlign"; }

  virtual inline int MinBottomBlobs() const { return 2; }
  virtual inline int MaxBottomBlobs() const { return 2; }
  virtual inline int MinTopBlobs() const { return 1; }
  virtual inline int MaxTopBlobs() const { return 1; }

 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  int channels_;
  int height_;
  int width_;
  int pooled_height_;
  int pooled_width_;
  Dtype spatial_scale_;
  //====ROIAlign special param====
  Blob<Dtype> max_mult_;
  Blob<int> max_pts_;
  //==============================
};
}  // namespace caffe

#endif  // CAFFE_ROI_Align_LAYER_HPP_

源文件

/caffe目录下，src文件夹下放的都是层的.cpp源文件，前缀名即为层的名字
每个层的源码一般要包括.cpp和.cu文件，.cu文件是在gpu上运行时执行的文件
层的源码要包含正向和反向传播

roi_align_layer.cpp

// ------------------------------------------------------------------
// Project: Mask R-CNN 
// File: ROIAlignLayer
// Adopted from roi_pooling_layer.cu (written by Ross Grischik)
// Author: Jasjeet Dhaliwal
// ------------------------------------------------------------------
#include <cfloat>
#include <algorithm> 
#include <stdlib.h> 
#include <string>
#include <utility>
#include <vector>

#include "caffe/layers/roi_align_layer.hpp"

using std::max;
using std::min;
using std::floor;
using std::ceil;
using std::vector; 
using std::fabs; 
namespace caffe {

template <typename Dtype> 
void ROIAlignLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, 
                                      const vector<Blob<Dtype>*>& top) 
{
  ROIAlignParameter roi_align_param = this->layer_param_.roi_align_param();
  CHECK_GT(roi_align_param.pooled_h(), 0)
      << "pooled_h must be > 0";
  CHECK_GT(roi_align_param.pooled_w(), 0)
      << "pooled_w must be > 0";
  pooled_height_ = roi_align_param.pooled_h();
  pooled_width_ = roi_align_param.pooled_w();
  spatial_scale_ = roi_align_param.spatial_scale();
  LOG(INFO) << "Spatial scale: " << spatial_scale_;
}

template <typename Dtype>
void ROIAlignLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
                                   const vector<Blob<Dtype>*>& top) 
{
  channels_ = bottom[0]->channels();
  height_ = bottom[0]->height();
  width_ = bottom[0]->width();
  top[0]->Reshape(bottom[1]->num(), channels_, pooled_height_,
      pooled_width_);
  int shape_init[] = {bottom[1]->num(), channels_, pooled_height_,
      pooled_width_, 4};
  const vector<int> shape(shape_init, shape_init + sizeof(shape_init) 
      / sizeof(int));
  max_mult_.Reshape(shape); 
  max_pts_.Reshape(shape);
}

template <typename Dtype>
void ROIAlignLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
                                         const vector<Blob<Dtype>*>& top) 
{ 
  LOG(INFO) << "DOING CPU FORWARD NOW "; 
  const Dtype* bottom_data = bottom[0]->cpu_data();
  const Dtype* bottom_rois = bottom[1]->cpu_data();
  // Number of ROIs
  int num_rois = bottom[1]->num();
  int batch_size = bottom[0]->num();
  int top_count = top[0]->count();
  Dtype* top_data = top[0]->mutable_cpu_data();
  caffe_set(top_count, Dtype(-FLT_MAX), top_data);
  int* argmax_idx = max_pts_.mutable_cpu_data();
  Dtype* argmax_mult = max_mult_.mutable_cpu_data(); 
  caffe_set(top_count*4, -1, argmax_idx);
  caffe_set(top_count*4, Dtype(-FLT_MAX), argmax_mult);
  //std::cout << "TOTAL = " << num_rois*channels_*height_*width_ << "\n"; 
  // For each ROI R = [batch_index x1 y1 x2 y2]:
  for (int n = 0; n < num_rois; ++n) {

    int roi_batch_ind = bottom_rois[0];
    Dtype roi_start_w = bottom_rois[1] * spatial_scale_;
    Dtype roi_start_h = bottom_rois[2] * spatial_scale_;
    Dtype roi_end_w = bottom_rois[3] * spatial_scale_;
    Dtype roi_end_h = bottom_rois[4] * spatial_scale_;
    CHECK_GE(roi_batch_ind, 0);
    CHECK_LT(roi_batch_ind, batch_size);
      if (n != roi_batch_ind) {
        continue;
      }
    //Util Values
    Dtype one = 1.0; 
    Dtype zero = 0.0; 

    Dtype roi_height = max(roi_end_h - roi_start_h, one);  
    Dtype roi_width = max(roi_end_w - roi_start_w, one);
    const Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height_);
    const Dtype bin_size_w = roi_width  /  static_cast<Dtype>(pooled_width_);

    const Dtype* batch_data = bottom_data + bottom[0]->offset(roi_batch_ind);
    int argmax_offset_init[] = {0,1,0,0,0}; 
    const vector<int> offset_argmax(argmax_offset_init, 
                 argmax_offset_init + sizeof(argmax_offset_init) /sizeof(int)); 
    for (int c = 0; c < channels_; ++c) {
      for (int ph = 0; ph < pooled_height_; ++ph) {
        for (int pw = 0; pw < pooled_width_; ++pw) {

          Dtype hstart = static_cast<Dtype>(ph) * bin_size_h;
          Dtype wstart = static_cast<Dtype>(pw) * bin_size_w;
          Dtype hend = static_cast<Dtype>(ph + 1)* bin_size_h;
          Dtype wend =static_cast<Dtype>(pw + 1) * bin_size_w;

          hstart = min(max(hstart + roi_start_h, zero), static_cast<Dtype>(height_));
          hend = min(max(hend + roi_start_h, zero), static_cast<Dtype>(height_));
          wstart = min(max(wstart + roi_start_w, zero), static_cast<Dtype>(width_));
          wend = min(max(wend + roi_start_w, zero), static_cast<Dtype>(width_));

          Dtype maxvalue = -FLT_MAX; 
          int maxidx[4];
          Dtype maxmult[4];   
          bool is_empty = (hend <= hstart) || (wend <= wstart);

          const int pool_index = ph * pooled_width_ + pw;
          int argmax_index = (ph * pooled_width_ + pw) * 4;
          if (is_empty) {
            maxvalue = 0;
            for (int i = 0; i<4; ++i) {
              maxidx[i] = -1;
              maxmult[i] = -FLT_MAX; 
            } 
          }
            Dtype samples_n[8] = {-0.5, -0.5, -0.5, 0.5, 
                                   0.5, -0.5, 0.5, 0.5}; 
            Dtype bisampled[4];
            int counter = 0;  
            Dtype x_smp_n = -2.0, y_smp_n = -2.0, h_idx_n = -2.0, w_idx_n = -2.0; 
           
            //Bilinearly Interpolate 4 sampled values
            for (int smp = 0; smp < sizeof(samples_n)/sizeof(*samples_n) ; smp+=2) {
              x_smp_n = samples_n[smp]; 
              y_smp_n = samples_n[smp+1]; 
              
              bisampled[smp/2] = 0.0;
              int b_index[4] = {-1, -1 , -1, -1}; //, -1,-1,-1,-1}; 
              int b_index_curr[4] = {-1, -1 , -1, -1}; //, -1,-1,-1,-1};
              Dtype multiplier[4] = {Dtype(-FLT_MAX), Dtype(-FLT_MAX), Dtype(-FLT_MAX), Dtype(-FLT_MAX)};   
                                       //Dtype(-FLT_MAX), Dtype(-FLT_MAX), Dtype(-FLT_MAX), Dtype(-FLT_MAX)};  

              counter = 0;
              for (int h_idx = floor(hstart); h_idx <= ceil(hend) && h_idx < height_; ++h_idx) {
                for (int w_idx = floor(wstart); w_idx <= ceil(wend) && w_idx < width_; ++w_idx) {
                  if (counter < 4) {
                    b_index[counter] = ((((n*channels_ + c) * height_) + h_idx ) * width_ )+ w_idx; 
                    b_index_curr[counter] = (h_idx*width_) + w_idx; 
                    //Normalize h_idx and w_idx
                    h_idx_n =  static_cast<Dtype>( (2*(static_cast<Dtype>(h_idx) - roi_start_h) / (roi_end_h - roi_start_h)) - 1);
                    w_idx_n =  static_cast<Dtype>( (2*(static_cast<Dtype>(w_idx) - roi_start_w) / (roi_end_w - roi_start_w))  - 1);
                    h_idx_n = min(max(h_idx_n, static_cast<Dtype>(-1.0)),one);
                    w_idx_n = min(max(w_idx_n, static_cast<Dtype>(-1.0)),one);
                    multiplier[counter] = max(zero,static_cast<Dtype>(1 - fabs(x_smp_n - w_idx_n))) 
                                             * max(zero,static_cast<Dtype>(1 - fabs(y_smp_n - h_idx_n)));

                    bisampled[smp/2] += batch_data[b_index_curr[counter]]*multiplier[counter];  
                    ++counter; 
		 } else { 
		    goto stop;  
		 }
                } // w_idx
              } //h_idx
              stop:
              if (bisampled[smp/2] > maxvalue) {
                maxvalue = bisampled[smp/2];
                for (int i=0; i<4;++i) {
                  maxidx[i] = b_index[i];
                  maxmult[i] = multiplier[i];  
	        }

              }
            } //smp
            //Store value in the top blob
            top_data[pool_index] = maxvalue;
            for (int i = 0; i<4; ++i, ++argmax_index) {
              argmax_idx[argmax_index] = maxidx[i]; 
              argmax_mult[argmax_index] = maxmult[i]; 
            }
        } //pw
      } // ph
      // Increment all data pointers by one channel
      batch_data += bottom[0]->offset(0, 1);
      top_data += top[0]->offset(0, 1);
      if ( (c+1) < channels_ ){
        argmax_idx += max_pts_.offset(offset_argmax);
        argmax_mult += max_mult_.offset(offset_argmax); 
      }
    } // channels
    // Increment ROI data pointer
    bottom_rois += bottom[1]->offset(1);
  }//num_rois
}

template <typename Dtype>
void ROIAlignLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {

  const Dtype* bottom_rois = bottom[1]->cpu_data();
  const Dtype* top_diff = top[0]->cpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  const int count = bottom[0]->count();
  caffe_set(count, Dtype(0.), bottom_diff);
  
  int num_rois = bottom[1]->num();
  int batch_size = bottom[0]->num(); 
  const int* argmax_idx = max_pts_.cpu_data();
  const Dtype* argmax_mult = max_mult_.cpu_data(); 
  
  int index = 0; //Current index
 // std::cout <<"Batch = " << batch_size << "\n";
  for (int b = 0; b < batch_size; ++b){ 
    for (int c = 0; c < channels_; ++c){
      for (int h = 0; h < height_; ++h) {
        for (int w = 0; w < width_; ++w) {
          index = ( ( ( ( (b * channels_ ) + c ) * height_ ) + h) * width_) + w; 
  	  // Go over every ROI 
          Dtype gradient = 0.0; 
  	  for (int n = 0; n < num_rois; ++n) {
    	    const Dtype* offset_bottom_rois = bottom_rois + n * 5;
            int roi_batch_ind = offset_bottom_rois[0];
            CHECK_GE(roi_batch_ind, 0);
    	    CHECK_LT(roi_batch_ind, batch_size);

            int offset = (n * channels_ + c) * pooled_height_ * pooled_width_;
            int argmax_offset = offset * 4;
            const Dtype* offset_top_diff = top_diff + offset;
            const int* offset_argmax_idx = argmax_idx + argmax_offset;
            const Dtype* offset_argmax_mult = argmax_mult + argmax_offset;
            Dtype multiplier = 0.0;  
            for (int ph = 0; ph < pooled_height_; ++ph) {
              for (int pw = 0; pw < pooled_width_; ++pw) {
                for (int k = 0; k < 4; ++k) {
                  if (offset_argmax_idx[((ph * pooled_width_ + pw) * 4) + k] == index) {
                    multiplier = offset_argmax_mult[( (ph * pooled_width_ + pw) * 4) + k];
                    gradient+= offset_top_diff[ph * pooled_width_ + pw] * multiplier; 
                  }
                } 
              }//Pw
            } //Ph 
	}// rois
        bottom_diff[index] = gradient; 
      }// width
    }//height
   }//channels
  }//count
}
#ifdef CPU_ONLY
STUB_GPU(ROIAlignLayer);
#endif

INSTANTIATE_CLASS(ROIAlignLayer);
REGISTER_LAYER_CLASS(ROIAlign);

}  // namespace caffe

在相同路径下，添加在gpu中运行的.cu文件

roi_align_layer.cu

// ------------------------------------------------------------------
// Project: Mask R-CNN 
// File: ROIAlignLayer
// Adopted from roi_pooling_layer.cu (written by Ross Grischik)
// Author: Jasjeet Dhaliwal
// ------------------------------------------------------------------

#include <cfloat>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#include <algorithm> 
#include <stdlib.h> 

#include "caffe/layers/roi_align_layer.hpp"
// #include "caffe/blob.hpp"
// #include "caffe/common.hpp"
// #include "caffe/layer.hpp"
// #include "caffe/vision_layers.hpp"
// #include "caffe/proto/caffe.pb.h"

using std::max;
using std::min;
using std::floor; 
using std::ceil; 
using std::fabs; 
using std::cout; 

namespace caffe {

template <typename Dtype>
__global__ void ROIAlignForward(const int nthreads, const Dtype* bottom_data,
    const Dtype spatial_scale, const int channels, const int height,
    const int width, const int pooled_height, const int pooled_width,
    const Dtype* bottom_rois, Dtype* top_data, int* argmax_idx, Dtype* argmax_mult) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    int argmax_index = index * 4; 

    bottom_rois += n * 5;
    int roi_batch_ind = bottom_rois[0];
    Dtype roi_start_w = bottom_rois[1] * spatial_scale;
    Dtype roi_start_h = bottom_rois[2] * spatial_scale;
    Dtype roi_end_w = bottom_rois[3] * spatial_scale;
    Dtype roi_end_h = bottom_rois[4] * spatial_scale;
    
    //Util Values
    Dtype zero = 0.0, one = 1.0;
 
    // Force malformed ROIs to be 1x1
    Dtype roi_width = max(roi_end_w - roi_start_w + 1.0, one);
    Dtype roi_height = max(roi_end_h - roi_start_h + 1.0, one);
    Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height);
    Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width);

    Dtype hstart = static_cast<Dtype>(ph) * bin_size_h;
    Dtype wstart = static_cast<Dtype>(pw) * bin_size_w;
    Dtype hend = static_cast<Dtype>(ph + 1) * bin_size_h;
    Dtype wend = static_cast<Dtype>(pw + 1) * bin_size_w;

    // Add roi offsets and clip to input boundaries
    hstart = min(max(hstart + roi_start_h, zero), static_cast<Dtype>(height) );
    hend = min(max(hend + roi_start_h, zero), static_cast<Dtype>(height));
    wstart = min(max(wstart + roi_start_w, zero), static_cast<Dtype>(width));
    wend = min(max(wend + roi_start_w, zero), static_cast<Dtype>(width));
    
    bool is_empty = (hend <= hstart) || (wend <= wstart);

    // Define an empty pooling region to be zero

   

    Dtype maxvalue = is_empty ? 0 : -FLT_MAX;
    int maxidx[4];
    Dtype maxmult[4];  
    //int bottom_offset =  (roi_batch_ind * channels + c) * height * width ;
    //bottom_data += (roi_batch_ind * channels + c) * height * width;
    /* Normalization function - normalizes values between -1 and 1. 
    a = -1, b = 1
    y = f(x) = [[(b - a) (x - roi_start_h)] / [roi_end_h - roi_start_h]] + a
    x = f^{-1}(y) = [[(f(x) - a)(roi_end_h - roi_end_h)] / (b - a)] + roi_start_h 
    Normalized coordinates of 4 regularly sampled points in the ROI: 
    sn_1 = (-0.5,-0.5) 
    sn_2 = (-0.5,0.5) 
    sn_3 = (0.5,-0.5) 
    sn_4 = (0.5,0.5)

    // Debugging purposes
    Dtype x_pos = (((0.5 + 1)*(roi_end_w - roi_start_w))/2.0) + roi_start_w;
    Dtype x_neg = (((-0.5 + 1)*(roi_end_w - roi_start_w))/2.0) + roi_start_w;
    Dtype y_pos = (((0.5 + 1)*(roi_end_h - roi_start_h))/2.0) + roi_start_h;
    Dtype y_neg = (((-0.5 + 1)*(roi_end_h - roi_start_h))/2.0) + roi_start_h;
    Dtype samples[2] = {x_neg, y_neg, x_neg, y_pos,
                        x_pos, y_neg, x_pos, y_pos};
    */
    
    Dtype samples_n[8] = {-0.5, -0.5, -0.5, 0.5, 
                           0.5, -0.5, 0.5, 0.5}; 
    //Holds interpolated values for each sample point
    Dtype bisampled[4]; 
    int counter = 0; 
    Dtype x_smp_n = -2.0, y_smp_n = -2.0, h_idx_n = -2.0, w_idx_n = -2.0;   
    //Bilinearly Interpolate 4 sampled values
    for (int smp = 0; smp < sizeof(samples_n)/sizeof(*samples_n) ; smp+=2) {
      x_smp_n = samples_n[smp]; 
      y_smp_n = samples_n[smp+1]; 

      bisampled[smp/2] = 0.0;
      int b_index[4] = {-1, -1 , -1, -1}; // -1,-1,-1,-1}; 
      //int b_index_curr[4] = {-1,-1,-1,-1}; 
      Dtype multiplier[4] = {Dtype(-FLT_MAX), Dtype(-FLT_MAX), Dtype(-FLT_MAX), Dtype(-FLT_MAX)}; 
					//Dtype(-FLT_MAX), Dtype(-FLT_MAX), Dtype(-FLT_MAX), Dtype(-FLT_MAX)};  
      counter = 0; 
      //ceil(hstart)
      //floor(hend)
      for (int h_idx = ceil(hstart); h_idx <= floor(hend) && h_idx <= height && h_idx >= 0 ; ++h_idx) {
        for (int w_idx =ceil(wstart); w_idx <= floor(wend) && w_idx <= width && w_idx >= 0; ++w_idx) {
       if (counter < 4) {
            b_index[counter] =  ((((roi_batch_ind * channels) + c) * height) + h_idx) * width + w_idx; 
        //    b_index_curr[counter]= h_idx*width + w_idx; 
            //Normalize width and height to lie between -1 and 1
            h_idx_n = static_cast<Dtype>( (static_cast<Dtype>(2)*(static_cast<Dtype>(h_idx) - roi_start_h) / (roi_end_h - roi_start_h)) - 1); 
            w_idx_n =  static_cast<Dtype>((static_cast<Dtype>(2)*(static_cast<Dtype>(w_idx) - roi_start_w) / (roi_end_w - roi_start_w))  - 1);
            h_idx_n = min(max(h_idx_n, static_cast<Dtype>(-1.0)),one);  
            w_idx_n = min(max(w_idx_n, static_cast<Dtype>(-1.0)),one);  
            multiplier[counter]=  max(zero ,static_cast<Dtype>(1 - fabs(x_smp_n - w_idx_n))) * max(zero,static_cast<Dtype>(1 - fabs(y_smp_n - h_idx_n)));
            //bisampled[smp/2] += multiplier[counter]; 
            bisampled[smp/2] += bottom_data[ b_index[counter]] * multiplier[counter];
            ++counter; 
	  } else {
	     goto stop; 
	  }
        } //w
      }//h
      stop:
      if (bisampled[smp/2] > maxvalue) {
        maxvalue = bisampled[smp/2]; 
        //Using two loops to comply with c++ convention
        for (int i=0; i<4;++i) {
          maxidx[i] = b_index[i];
          maxmult[i] = multiplier[i]; 
	}

      }
    } //smp
    //Store value in the top blob
    top_data[index] = maxvalue;
    for (int i = 0; i<4; ++i, ++argmax_index) {
      argmax_idx[argmax_index] = maxidx[i];
      argmax_mult[argmax_index] = maxmult[i]; 
    }
  }
}

template <typename Dtype>
void ROIAlignLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->gpu_data();
  const Dtype* bottom_rois = bottom[1]->gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  int* argmax_idx = max_pts_.mutable_gpu_data();
  Dtype* argmax_mult = max_mult_.mutable_gpu_data(); 
  int count = top[0]->count();
  LOG(INFO) << "Doing forward now";   
  // NOLINT_NEXT_LINE(whitespace/operators)
  //Change CAFFE_CUDA_NUM_THREADS to 64
  ROIAlignForward<Dtype><<<CAFFE_GET_BLOCKS(count), 32>>>(
      count, bottom_data, spatial_scale_, channels_, height_, width_,
      pooled_height_, pooled_width_, bottom_rois, top_data, argmax_idx, argmax_mult);
   LOG(INFO) << "Done forward "; 
  CUDA_POST_KERNEL_CHECK;
}

template <typename Dtype>
__global__ void ROIAlignBackward(const int nthreads, const Dtype* top_diff,
    const int* argmax_idx, const Dtype* argmax_mult, const int num_rois, const Dtype spatial_scale,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, Dtype* bottom_diff,
    const Dtype* bottom_rois) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // (n, c, h, w) coords in bottom data
    int w = index % width;
    int h = (index / width) % height;
    int c = (index / width / height) % channels;
    int n = index / width / height / channels;
    
    Dtype gradient = 0.0;
    // Accumulate gradient over all ROIs that pooled this element
    for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
      //const Dtype* offset_bottom_rois = bottom_rois + roi_n * 5;
      //int roi_batch_ind = offset_bottom_rois[0];
      // Skip if ROI's batch index doesn't match n
     // if (n != roi_batch_ind) {
       // continue;
     // }
      const Dtype* offset_bottom_rois = bottom_rois + roi_n * 5;
      int roi_batch_ind = offset_bottom_rois[0];
      // Skip if ROI's batch index doesn't match n
      if (n != roi_batch_ind) {
        continue;
      }

      int roi_start_w = ceil(offset_bottom_rois[1] * spatial_scale);
      int roi_start_h = ceil(offset_bottom_rois[2] * spatial_scale);
      int roi_end_w = floor(offset_bottom_rois[3] * spatial_scale);
      int roi_end_h = floor(offset_bottom_rois[4] * spatial_scale);

      // Skip if ROI doesn't include (h, w)
      const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
                           h >= roi_start_h && h <= roi_end_h);
      if (!in_roi) {
        continue;
      }

      int offset = (roi_n * channels + c) * pooled_height * pooled_width;
      int argmax_offset = offset * 4; 
      const Dtype* offset_top_diff = top_diff + offset;
      const int* offset_argmax_idx = argmax_idx + argmax_offset;
      const Dtype* offset_argmax_mult = argmax_mult + argmax_offset;
      // Util Vals
      Dtype multiplier = 0.0; 
      for (int ph = 0; ph < pooled_height; ++ph) {
        for (int pw = 0; pw < pooled_width; ++pw) {
          for (int k = 0; k < 4; ++k) {
            if (offset_argmax_idx[((ph * pooled_width + pw) * 4) + k] == index  ) {
              multiplier = offset_argmax_mult[( (ph * pooled_width + pw) * 4) + k]; 
              gradient += offset_top_diff[ph * pooled_width + pw] * multiplier;
            }
	  }
        }//pw
      }//ph
    }//rois
    bottom_diff[index] = gradient;
  }
}

template <typename Dtype>
void ROIAlignLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) {
    return;
  }
  const Dtype* bottom_rois = bottom[1]->gpu_data();
  const Dtype* top_diff = top[0]->gpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
  const int count = bottom[0]->count();
  caffe_gpu_set(count, Dtype(0.), bottom_diff);
  const int* argmax_idx = max_pts_.gpu_data();
  const Dtype* argmax_mult = max_mult_.gpu_data(); 
  // NOLINT_NEXT_LINE(whitespace/operators)
  // CAFFE_CUDA_NUM_THREADS replaced with 64
   LOG(INFO) << "Doing backward "; 
  ROIAlignBackward<Dtype><<<CAFFE_GET_BLOCKS(count), 16>>>(
      count, top_diff, argmax_idx, argmax_mult, top[0]->num(), spatial_scale_, channels_,
      height_, width_, pooled_height_, pooled_width_, bottom_diff, bottom_rois);
  LOG(INFO) << "Done backward"; 
  CUDA_POST_KERNEL_CHECK;
}

INSTANTIATE_LAYER_GPU_FUNCS(ROIAlignLayer);

}  // namespace caffe

测试文件

测试文件是为了验证新定义的层在CPU和GPU下正向和反向传播是否正确，可选。一般每一个层都有对应的测试文件，放在./src/caffe/tes/路径下

test_roi_align_layer.cpp

// ------------------------------------------------------------------
// Fast R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Ross Girshick
// ------------------------------------------------------------------

#include <cmath>
#include <cstdlib>
#include <cstring>
#include <vector>

#include "boost/scoped_ptr.hpp"
#include "gtest/gtest.h"

#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/layers/roi_pooling_layer.hpp"

#include "caffe/test/test_caffe_main.hpp"
#include "caffe/test/test_gradient_check_util.hpp"

using boost::scoped_ptr;

namespace caffe {

typedef ::testing::Types<GPUDevice<float>, GPUDevice<double> > TestDtypesGPU;

template <typename TypeParam>
class ROIAlignLayerTest : public MultiDeviceTest<TypeParam> {
  typedef typename TypeParam::Dtype Dtype;

 protected:
  ROIAlignLayerTest()
      : blob_bottom_data_(new Blob<Dtype>(4, 3, 12, 8)),
        blob_bottom_rois_(new Blob<Dtype>(4, 5, 1, 1)),
        blob_top_data_(new Blob<Dtype>()) {
    // fill the values
    FillerParameter filler_param;
    filler_param.set_std(10);
    GaussianFiller<Dtype> filler(filler_param);
    filler.Fill(this->blob_bottom_data_);
    //for (int i = 0; i < blob_bottom_data_->count(); ++i) {
    //  blob_bottom_data_->mutable_cpu_data()[i] = i;
    //}
    blob_bottom_vec_.push_back(blob_bottom_data_);
    int i = 0;
    blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 0; //caffe_rng_rand() % 4;
    blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 1; // x1 < 8
    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 1; // y1 < 12
    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 6; // x2 < 8
    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 6; // y2 < 12
    i = 1;
    blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 2;
    blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 6; // x1 < 8
    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 2; // y1 < 12
    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 7; // x2 < 8
    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 11; // y2 < 12
    i = 2;
    blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 1;
    blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 3; // x1 < 8
    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 1; // y1 < 12
    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 5; // x2 < 8
    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 10; // y2 < 12
    i = 3;
    blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 0;
    blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 3; // x1 < 8
    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 3; // y1 < 12
    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 3; // x2 < 8
    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 3; // y2 < 12

    blob_bottom_vec_.push_back(blob_bottom_rois_);
    blob_top_vec_.push_back(blob_top_data_);
  }
  virtual ~ROIAlignLayerTest() {
    delete blob_bottom_data_;
    delete blob_bottom_rois_;
    delete blob_top_data_;
  }
  Blob<Dtype>* const blob_bottom_data_;
  Blob<Dtype>* const blob_bottom_rois_;
  Blob<Dtype>* const blob_top_data_;
  vector<Blob<Dtype>*> blob_bottom_vec_;
  vector<Blob<Dtype>*> blob_top_vec_;
};

TYPED_TEST_CASE(ROIAlignLayerTest, TestDtypesGPU);

TYPED_TEST(ROIAlignLayerTest, TestGradient) {
  typedef typename TypeParam::Dtype Dtype;
  LayerParameter layer_param;
  ROIAlignParameter* roi_align_param =
      layer_param.mutable_roi_pooling_param();
  roi_align_param->set_pooled_h(6);
  roi_align_param->set_pooled_w(6);
  ROIAlignLayer<Dtype> layer(layer_param);
  GradientChecker<Dtype> checker(1e-4, 1e-2);
  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
      this->blob_top_vec_, 0);
}

}  // namespace caffe

至此，需要添加完全自定义的文件就已经就位了，当然头文件和测试文件参照同类型的层写还是可以的。

修改文件

在./src/caffe/proto/caffe/proto文件夹中的caffe.proto文件中增加自定义层的相关参数和层的ID号

在caffe.proto文件中查找【message LayerParameter {···} 】并找到当前已有层的最大ID号，不妨假设当前max_id=151，则在{···}末尾新增一行 optional LayernamexxParameter layernamexx_param=152;

在caffe.proto文件中添加可以在网络结构.prototxt文件中调用层时可定的参数
- unit32,float为参数类型；
- = 之后连接的是第几个参数，从1开始； - [default=] 连接的是参数的默认值；

    //============================ added by tower =============================
    // Message that stores parameters used by ROIAlignLayer
    message ROIAlignParameter {
      // Pad, kernel size, and stride are all given as a single value for equal
      // dimensions in height and width or as Y, X pairs.
      optional uint32 pooled_h = 1 [default = 7]; // The pooled output height
      optional uint32 pooled_w = 2 [default = 7]; // The pooled output width
      // Multiplicative spatial scale factor to translate ROI coords from their
      // input scale to the scale used when pooling
      optional float spatial_scale = 3 [default = 0.625];
    }
    //=========================================================================

至此，需要改动的地方就已经全部完成啦~

重新编译

首先切换到caffe根目录，cd ~/mask/faster-mask/caffe，使用命令清空编译的记录

[Yanglp@lenovo1 caffe]$ make clean

然后可以看到根目录下的隐藏文件夹./build_release被删除了，

接着先编译caffe根目录之外的lib文件夹

[Yanglp@lenovo1 caffe]$ cd ../lib
[Yanglp@lenovo1 lib]$ make
python setup.py build_ext --inplace
running build_ext
skipping 'utils/bbox.c' Cython extension (up-to-date)
skipping 'nms/cpu_nms.c' Cython extension (up-to-date)
skipping 'nms/gpu_nms.cpp' Cython extension (up-to-date)
skipping 'pycocotools/_mask.c' Cython extension (up-to-date)
rm -rf build

解决编译报错

编译lib完成后，再回到caffe目录下make，但此时编译会报错：

问题1：编译器版本不对

解决方法：切换g++和gcc版本，以下分别是查看编译器和切换版本的命令：

[Yanglp@lenovo1 lib]$ g++ --version
g++ (GCC) 4.4.7 20120313 (Red Hat 4.4.7-17)
Copyright (C) 2010 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

[Yanglp@lenovo1 lib]$ scl enable devtoolset-1.1 bash
[Yanglp@lenovo1 lib]$ g++ --version
g++ (GCC) 4.7.2 20121015 (Red Hat 4.7.2-5)
Copyright (C) 2012 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

[Yanglp@lenovo1 lib]$ cd ../caffe

问题2 ：找不到caffe.pb.h文件

  make: *** [.build_release/src/caffe/blob.o] Error 1
  In file included from ./include/caffe/util/device_alternate.hpp:40:0,
                   from ./include/caffe/common.hpp:19,
                   from src/caffe/util/io.cpp:18:
  ./include/caffe/util/cudnn.hpp:8:34: fatal error: caffe/proto/caffe.pb.h: No such file or directory
  compilation terminated.
  make: *** [.build_release/src/caffe/util/io.o] Error 1
  In file included from ./include/caffe/util/device_alternate.hpp:40:0,
                   from ./include/caffe/common.hpp:19,
                   from src/caffe/util/ma
                   th_functions.cpp:6:
  ./include/caffe/util/cudnn.hpp:8:34: fatal error: caffe/proto/caffe.pb.h: No such file or directory
  compilation terminated.

解决方法：由于我们修改了./src/caffe/proto/caffe/proto/caffe.proto文件又make clean了，就需要自己生成对应的caffe.pb.h

[Yanglp@lenovo1 caffe]$ cd /share/manage/Yanglp/mask/faster-mask/caffe/src/caffe/proto/
[Yanglp@lenovo1 proto]$ protoc --cpp_out=/share/manage/Yanglp/mask/faster-mask/caffe/include/caffe/ caffe.proto
[Yanglp@lenovo1 proto]$ cd ../../../..
[Yanglp@lenovo1 caffe]$ find ./ -name "*caffe.pb.h*"
./include/caffe/caffe.pb.h
[Yanglp@lenovo1 caffe]$

使用protoc命令将caffe.proto文件生成对应的caffe.pb.h文件即可

再次编译

[Yanglp@lenovo1 caffe]$ make -j8 && make pycaffe

可以看到，自定义的层顺利通过编译

Caffe源码

自定义Caffe层并编译