Mask R-CNN在源码中添加ROI Align层
RoI ( Region of Interest ), 即为原图中的Region Proposals映射到特征图( feature map )上的框。Fast R-CNN中首次提出的ROI Pooling,由于在下采样过程中两次取整量化的操作,使得RoI有些许的位置偏差。因而在Mask R-CNN中提出了ROI Align,通过使用双线性插值计算小数坐标位置的值,结果更加精确。
添加文件
需要添加的文件目录
./faster-mask/caffe
|--./include/caffe/layers/roi_align_layer.hpp
|--./src/caffe/layers/roi_align_layer.cpp
|--./src/caffe/layers/roi_align_layer.cu
|--./src/caffe/test/test_roi_align_layer.cpp
头文件
- ./caffe目录下,include文件夹下放的都是层的.hpp头文件,前缀名与src中层相对应
- 头文件要包含定义层中使用的类变量和类型,声明使用的函数
roi_align_layer.hpp
// The guard must be unique to this header. It previously reused
// CAFFE_ROI_POOLING_LAYER_HPP_, so including roi_pooling_layer.hpp first
// would silently hide the ROIAlignLayer declaration.
#ifndef CAFFE_ROI_ALIGN_LAYER_HPP_
#define CAFFE_ROI_ALIGN_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

/**
 * @brief ROI Align layer (Mask R-CNN): resamples each region of interest of a
 *        feature map to a fixed pooled_h x pooled_w grid using bilinearly
 *        weighted samples instead of quantized max pooling.
 *
 * Bottom blobs (exactly 2):
 *   - bottom[0]: feature map, N x C x H x W.
 *   - bottom[1]: ROIs, 5 values per ROI: [batch_index x1 y1 x2 y2], given in
 *     input-image coordinates and multiplied by spatial_scale internally.
 * Top blobs (exactly 1):
 *   - top[0]: num_rois x C x pooled_h x pooled_w.
 */
template <typename Dtype>
class ROIAlignLayer : public Layer<Dtype> {
 public:
  explicit ROIAlignLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}

  /// Reads pooled_h, pooled_w and spatial_scale from roi_align_param.
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  /// Sizes top[0] and the internal argmax buffers from the bottom shapes.
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "ROIAlign"; }

  virtual inline int MinBottomBlobs() const { return 2; }
  virtual inline int MaxBottomBlobs() const { return 2; }
  virtual inline int MinTopBlobs() const { return 1; }
  virtual inline int MaxTopBlobs() const { return 1; }

 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  // Shape of bottom[0] (channels, height, width), refreshed in Reshape().
  int channels_;
  int height_;
  int width_;
  // Output grid size per ROI.
  int pooled_height_;
  int pooled_width_;
  // Factor applied to ROI coordinates to map them onto the feature map.
  Dtype spatial_scale_;

  //====ROIAlign special param====
  // Both buffers have shape num_rois x C x pooled_h x pooled_w x 4. For every
  // output element they record the (up to) four bottom elements that formed
  // the winning sample: max_pts_ holds their flat indices into bottom[0]
  // (-1 when unused) and max_mult_ the matching interpolation weights. The
  // backward pass routes gradients through these records.
  Blob<Dtype> max_mult_;
  Blob<int> max_pts_;
  //==============================
};

}  // namespace caffe

#endif  // CAFFE_ROI_ALIGN_LAYER_HPP_
源文件
- ./caffe目录下,src文件夹下放的都是层的.cpp源文件,前缀名即为层的名字
- 每个层的源码一般要包括.cpp和.cu文件,.cu文件是在gpu上运行时执行的文件
- 层的源码要包含正向和反向传播
roi_align_layer.cpp
// ------------------------------------------------------------------
// Project: Mask R-CNN
// File: ROIAlignLayer
// Adapted from roi_pooling_layer.cu (written by Ross Girshick)
// Author: Jasjeet Dhaliwal
// ------------------------------------------------------------------
#include <cfloat>
#include <algorithm>
#include <stdlib.h>
#include <string>
#include <utility>
#include <vector>
#include "caffe/layers/roi_align_layer.hpp"
using std::max;
using std::min;
using std::floor;
using std::ceil;
using std::vector;
using std::fabs;
namespace caffe {
/**
 * One-time layer configuration: validates the pooled output size from
 * roi_align_param and caches it together with the spatial scale.
 * The bottom/top blobs are not inspected here.
 */
template <typename Dtype>
void ROIAlignLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const ROIAlignParameter& cfg = this->layer_param_.roi_align_param();

  // A zero-sized output grid would make every bin size undefined.
  CHECK_GT(cfg.pooled_h(), 0) << "pooled_h must be > 0";
  CHECK_GT(cfg.pooled_w(), 0) << "pooled_w must be > 0";

  pooled_height_ = cfg.pooled_h();
  pooled_width_ = cfg.pooled_w();
  spatial_scale_ = cfg.spatial_scale();
  LOG(INFO) << "Spatial scale: " << spatial_scale_;
}
/**
 * Caches the feature-map dimensions of bottom[0] and sizes the outputs:
 * top[0] becomes num_rois x C x pooled_h x pooled_w, and the two argmax
 * buffers get the same shape with a trailing axis of 4 (one slot per
 * interpolation neighbour).
 */
template <typename Dtype>
void ROIAlignLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const int roi_count = bottom[1]->num();

  channels_ = bottom[0]->channels();
  height_ = bottom[0]->height();
  width_ = bottom[0]->width();

  top[0]->Reshape(roi_count, channels_, pooled_height_, pooled_width_);

  // 5-axis shape: (roi, channel, ph, pw, neighbour).
  vector<int> argmax_shape;
  argmax_shape.push_back(roi_count);
  argmax_shape.push_back(channels_);
  argmax_shape.push_back(pooled_height_);
  argmax_shape.push_back(pooled_width_);
  argmax_shape.push_back(4);

  max_mult_.Reshape(argmax_shape);
  max_pts_.Reshape(argmax_shape);
}
/**
 * CPU forward pass.
 *
 * For every ROI [batch_index x1 y1 x2 y2] in bottom[1], every channel and
 * every output bin (ph, pw), four sample points are evaluated at normalized
 * ROI coordinates (+-0.5, +-0.5). Each sample is a weighted sum of up to four
 * bottom elements taken from the bin's footprint; the largest sample becomes
 * the output. The flat bottom indices and weights of the winning sample are
 * stored in max_pts_ / max_mult_ for the backward pass.
 *
 * Fixes relative to the previous version:
 *  - Every ROI is processed. The old `if (n != roi_batch_ind) continue;`
 *    skipped all ROIs whose position in the list differed from their batch
 *    index and, because it ran before the pointer increment, also left the
 *    ROI pointer stuck on the skipped entry.
 *  - Recorded indices are built from roi_batch_ind rather than the ROI number,
 *    so they address the image the values were read from (Backward_cpu
 *    interprets them as flat indices into bottom[0]).
 *  - Output and argmax locations are computed from (n, c, ph, pw) directly;
 *    the old incremental pointers advanced the argmax buffers by only
 *    channels_-1 planes per ROI, so later ROIs overwrote earlier records.
 *  - maxidx / maxmult are always initialised, so no indeterminate values are
 *    stored when no sample beats the initial maximum (e.g. NaN samples).
 *
 * NOTE(review): the normalization divides by (roi_end - roi_start), which is
 * zero for degenerate ROIs (x1 == x2 or y1 == y2); behaviour for such ROIs is
 * unchanged here and should be confirmed against the intended semantics.
 * NOTE(review): this path uses floor(start)..ceil(end) footprints and no "+1"
 * in the ROI extent, whereas the GPU kernel uses ceil(start)..floor(end) and
 * adds 1; the two paths are therefore not numerically identical.
 */
template <typename Dtype>
void ROIAlignLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  LOG(INFO) << "DOING CPU FORWARD NOW ";
  const Dtype* bottom_data = bottom[0]->cpu_data();
  const Dtype* bottom_rois = bottom[1]->cpu_data();
  const int num_rois = bottom[1]->num();
  const int batch_size = bottom[0]->num();
  const int top_count = top[0]->count();
  const int pooled_size = pooled_height_ * pooled_width_;

  Dtype* top_data = top[0]->mutable_cpu_data();
  int* argmax_idx = max_pts_.mutable_cpu_data();
  Dtype* argmax_mult = max_mult_.mutable_cpu_data();
  caffe_set(top_count, Dtype(-FLT_MAX), top_data);
  caffe_set(top_count * 4, -1, argmax_idx);
  caffe_set(top_count * 4, Dtype(-FLT_MAX), argmax_mult);

  const Dtype zero = 0.0;
  const Dtype one = 1.0;
  // (x, y) pairs of the four sample points in normalized [-1, 1] ROI space.
  const Dtype samples_n[8] = {-0.5, -0.5, -0.5, 0.5,
                               0.5, -0.5,  0.5, 0.5};
  const int num_samples = 4;

  // For each ROI R = [batch_index x1 y1 x2 y2]:
  for (int n = 0; n < num_rois; ++n) {
    const Dtype* roi = bottom_rois + bottom[1]->offset(n);
    const int roi_batch_ind = static_cast<int>(roi[0]);
    CHECK_GE(roi_batch_ind, 0);
    CHECK_LT(roi_batch_ind, batch_size);

    const Dtype roi_start_w = roi[1] * spatial_scale_;
    const Dtype roi_start_h = roi[2] * spatial_scale_;
    const Dtype roi_end_w = roi[3] * spatial_scale_;
    const Dtype roi_end_h = roi[4] * spatial_scale_;

    // Malformed ROIs are forced to an extent of at least 1.
    const Dtype roi_height = max(roi_end_h - roi_start_h, one);
    const Dtype roi_width = max(roi_end_w - roi_start_w, one);
    const Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height_);
    const Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width_);

    for (int c = 0; c < channels_; ++c) {
      // Start of the (roi_batch_ind, c) plane, as pointer and as flat index.
      const Dtype* batch_data =
          bottom_data + bottom[0]->offset(roi_batch_ind, c);
      const int bottom_base =
          (roi_batch_ind * channels_ + c) * height_ * width_;
      const int top_base = (n * channels_ + c) * pooled_size;

      for (int ph = 0; ph < pooled_height_; ++ph) {
        for (int pw = 0; pw < pooled_width_; ++pw) {
          // Bin extent in feature-map coordinates, clipped to the map.
          Dtype hstart = static_cast<Dtype>(ph) * bin_size_h;
          Dtype wstart = static_cast<Dtype>(pw) * bin_size_w;
          Dtype hend = static_cast<Dtype>(ph + 1) * bin_size_h;
          Dtype wend = static_cast<Dtype>(pw + 1) * bin_size_w;
          hstart = min(max(hstart + roi_start_h, zero),
                       static_cast<Dtype>(height_));
          hend = min(max(hend + roi_start_h, zero),
                     static_cast<Dtype>(height_));
          wstart = min(max(wstart + roi_start_w, zero),
                       static_cast<Dtype>(width_));
          wend = min(max(wend + roi_start_w, zero),
                     static_cast<Dtype>(width_));

          // An empty bin starts from 0 instead of -FLT_MAX.
          const bool is_empty = (hend <= hstart) || (wend <= wstart);
          Dtype maxvalue = is_empty ? zero : Dtype(-FLT_MAX);
          int maxidx[4] = {-1, -1, -1, -1};
          Dtype maxmult[4] = {Dtype(-FLT_MAX), Dtype(-FLT_MAX),
                              Dtype(-FLT_MAX), Dtype(-FLT_MAX)};

          const int h_first = static_cast<int>(floor(hstart));
          const int h_last = static_cast<int>(ceil(hend));
          const int w_first = static_cast<int>(floor(wstart));
          const int w_last = static_cast<int>(ceil(wend));

          // Evaluate the four sample points and keep the largest.
          for (int smp = 0; smp < num_samples; ++smp) {
            const Dtype x_smp_n = samples_n[2 * smp];
            const Dtype y_smp_n = samples_n[2 * smp + 1];
            Dtype bisampled = 0.0;
            int b_index[4] = {-1, -1, -1, -1};
            Dtype multiplier[4] = {Dtype(-FLT_MAX), Dtype(-FLT_MAX),
                                   Dtype(-FLT_MAX), Dtype(-FLT_MAX)};
            int counter = 0;

            // At most the first four elements of the footprint (row-major)
            // contribute; `counter < 4` ends both loops once they are taken.
            for (int h_idx = h_first;
                 counter < 4 && h_idx <= h_last && h_idx < height_; ++h_idx) {
              for (int w_idx = w_first;
                   counter < 4 && w_idx <= w_last && w_idx < width_;
                   ++w_idx) {
                const int plane_index = h_idx * width_ + w_idx;
                b_index[counter] = bottom_base + plane_index;

                // Map the element position into normalized ROI coordinates
                // and clamp to [-1, 1].
                Dtype h_idx_n = static_cast<Dtype>(
                    (2 * (static_cast<Dtype>(h_idx) - roi_start_h) /
                     (roi_end_h - roi_start_h)) - 1);
                Dtype w_idx_n = static_cast<Dtype>(
                    (2 * (static_cast<Dtype>(w_idx) - roi_start_w) /
                     (roi_end_w - roi_start_w)) - 1);
                h_idx_n = min(max(h_idx_n, static_cast<Dtype>(-1.0)), one);
                w_idx_n = min(max(w_idx_n, static_cast<Dtype>(-1.0)), one);

                // Weight falls off linearly with distance to the sample.
                multiplier[counter] =
                    max(zero, static_cast<Dtype>(1 - fabs(x_smp_n - w_idx_n)))
                    * max(zero,
                          static_cast<Dtype>(1 - fabs(y_smp_n - h_idx_n)));
                bisampled += batch_data[plane_index] * multiplier[counter];
                ++counter;
              }  // w_idx
            }  // h_idx

            if (bisampled > maxvalue) {
              maxvalue = bisampled;
              for (int i = 0; i < 4; ++i) {
                maxidx[i] = b_index[i];
                maxmult[i] = multiplier[i];
              }
            }
          }  // smp

          // Store the value and its provenance for this output element.
          const int pool_index = top_base + ph * pooled_width_ + pw;
          top_data[pool_index] = maxvalue;
          for (int i = 0; i < 4; ++i) {
            argmax_idx[pool_index * 4 + i] = maxidx[i];
            argmax_mult[pool_index * 4 + i] = maxmult[i];
          }
        }  // pw
      }  // ph
    }  // channels
  }  // num_rois
}
/**
 * CPU backward pass: propagates top[0] gradients to bottom[0].
 *
 * The forward pass recorded, for every output element, up to four flat
 * indices into bottom[0] (max_pts_) and their weights (max_mult_). Each
 * output gradient is scattered back to those indices, scaled by its weight.
 *
 * Changes relative to the previous version:
 *  - Returns early when propagate_down[0] is false, matching Backward_gpu.
 *  - Scatters from the outputs instead of scanning every ROI bin for every
 *    bottom element. Contributions to one bottom element are still added in
 *    the same (roi, ph, pw, slot) order, so the accumulated sums are
 *    unchanged, but the cost drops from
 *    O(N*C*H*W * num_rois*pooled_h*pooled_w) to O(num_rois*C*pooled_h*pooled_w).
 *
 * No gradient is produced for the ROI coordinates in bottom[1].
 */
template <typename Dtype>
void ROIAlignLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) {
    return;
  }
  const Dtype* bottom_rois = bottom[1]->cpu_data();
  const Dtype* top_diff = top[0]->cpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  const int count = bottom[0]->count();
  caffe_set(count, Dtype(0.), bottom_diff);

  const int num_rois = bottom[1]->num();
  const int batch_size = bottom[0]->num();
  const int* argmax_idx = max_pts_.cpu_data();
  const Dtype* argmax_mult = max_mult_.cpu_data();
  const int pooled_size = pooled_height_ * pooled_width_;

  for (int n = 0; n < num_rois; ++n) {
    // ROIs are stored as 5 consecutive values; the first is the batch index.
    const int roi_batch_ind = static_cast<int>(bottom_rois[n * 5]);
    CHECK_GE(roi_batch_ind, 0);
    CHECK_LT(roi_batch_ind, batch_size);

    for (int c = 0; c < channels_; ++c) {
      const int top_base = (n * channels_ + c) * pooled_size;
      for (int p = 0; p < pooled_size; ++p) {
        const int top_index = top_base + p;
        const Dtype diff = top_diff[top_index];
        for (int k = 0; k < 4; ++k) {
          const int src = argmax_idx[top_index * 4 + k];
          // -1 marks an unused slot; anything outside bottom[0] is ignored,
          // exactly as it could never match in the old gather loop.
          if (src < 0 || src >= count) {
            continue;
          }
          bottom_diff[src] += diff * argmax_mult[top_index * 4 + k];
        }
      }  // p
    }  // channels
  }  // rois
}
// Caffe boilerplate for a new layer (see Caffe's common.hpp / layer_factory.hpp
// for the macro definitions):
// - STUB_GPU: in CPU_ONLY builds, supplies placeholder Forward_gpu/Backward_gpu
//   definitions so the class still links without the .cu file.
#ifdef CPU_ONLY
STUB_GPU(ROIAlignLayer);
#endif
// - INSTANTIATE_CLASS: explicit template instantiation of the layer.
INSTANTIATE_CLASS(ROIAlignLayer);
// - REGISTER_LAYER_CLASS: registers the layer with the layer factory under the
//   type name "ROIAlign" (the string returned by type()).
REGISTER_LAYER_CLASS(ROIAlign);
} // namespace caffe
在相同路径下,添加在gpu中运行的.cu文件
roi_align_layer.cu
// ------------------------------------------------------------------
// Project: Mask R-CNN
// File: ROIAlignLayer
// Adapted from roi_pooling_layer.cu (written by Ross Girshick)
// Author: Jasjeet Dhaliwal
// ------------------------------------------------------------------
#include <cfloat>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include <algorithm>
#include <stdlib.h>
#include "caffe/layers/roi_align_layer.hpp"
// #include "caffe/blob.hpp"
// #include "caffe/common.hpp"
// #include "caffe/layer.hpp"
// #include "caffe/vision_layers.hpp"
// #include "caffe/proto/caffe.pb.h"
using std::max;
using std::min;
using std::floor;
using std::ceil;
using std::fabs;
using std::cout;
namespace caffe {
/**
 * CUDA forward kernel: one loop iteration per output element (n, c, ph, pw).
 *
 * For the element's bin, four sample points at normalized ROI coordinates
 * (+-0.5, +-0.5) are evaluated. Each sample is a weighted sum of up to four
 * bottom elements from the bin's footprint (rows ceil(hstart)..floor(hend),
 * columns ceil(wstart)..floor(wend)). The largest sample is written to
 * top_data; the flat bottom indices and weights that produced it are written
 * to argmax_idx / argmax_mult (4 slots per output element).
 *
 * Fixes relative to the previous version:
 *  - The ROI is addressed through a per-iteration pointer. The old
 *    `bottom_rois += n * 5` modified the kernel argument itself, so a thread
 *    that handles more than one index in CUDA_KERNEL_LOOP kept adding offsets
 *    and read the wrong (eventually out-of-range) ROI. The launch uses fewer
 *    threads per block than CAFFE_GET_BLOCKS assumes, so threads do iterate.
 *  - Footprint loops stop at h_idx < height and w_idx < width. The old
 *    `<=` bounds read one row past the plane / wrapped into the next row.
 *  - maxidx / maxmult are initialised (-1 / -FLT_MAX), so no indeterminate
 *    values are stored when no sample exceeds the initial maximum.
 *
 * NOTE(review): the normalization divides by (roi_end - roi_start), which is
 * zero for degenerate ROIs; that behaviour is unchanged and should be
 * confirmed against the intended semantics.
 *
 * @param nthreads     number of output elements (top count)
 * @param bottom_data  feature map, N x channels x height x width
 * @param bottom_rois  ROIs, 5 values each: [batch_index x1 y1 x2 y2]
 */
template <typename Dtype>
__global__ void ROIAlignForward(const int nthreads, const Dtype* bottom_data,
    const Dtype spatial_scale, const int channels, const int height,
    const int width, const int pooled_height, const int pooled_width,
    const Dtype* bottom_rois, Dtype* top_data, int* argmax_idx,
    Dtype* argmax_mult) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    const int pw = index % pooled_width;
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;

    // Local pointer: the kernel argument must stay untouched across
    // iterations of the enclosing loop.
    const Dtype* offset_bottom_rois = bottom_rois + n * 5;
    const int roi_batch_ind = offset_bottom_rois[0];
    const Dtype roi_start_w = offset_bottom_rois[1] * spatial_scale;
    const Dtype roi_start_h = offset_bottom_rois[2] * spatial_scale;
    const Dtype roi_end_w = offset_bottom_rois[3] * spatial_scale;
    const Dtype roi_end_h = offset_bottom_rois[4] * spatial_scale;

    // Util Values
    const Dtype zero = 0.0, one = 1.0;

    // Force malformed ROIs to be 1x1
    const Dtype roi_width = max(roi_end_w - roi_start_w + 1.0, one);
    const Dtype roi_height = max(roi_end_h - roi_start_h + 1.0, one);
    const Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height);
    const Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width);

    Dtype hstart = static_cast<Dtype>(ph) * bin_size_h;
    Dtype wstart = static_cast<Dtype>(pw) * bin_size_w;
    Dtype hend = static_cast<Dtype>(ph + 1) * bin_size_h;
    Dtype wend = static_cast<Dtype>(pw + 1) * bin_size_w;

    // Add roi offsets and clip to input boundaries
    hstart = min(max(hstart + roi_start_h, zero), static_cast<Dtype>(height));
    hend = min(max(hend + roi_start_h, zero), static_cast<Dtype>(height));
    wstart = min(max(wstart + roi_start_w, zero), static_cast<Dtype>(width));
    wend = min(max(wend + roi_start_w, zero), static_cast<Dtype>(width));
    const bool is_empty = (hend <= hstart) || (wend <= wstart);

    // Define an empty pooling region to be zero
    Dtype maxvalue = is_empty ? 0 : -FLT_MAX;
    int maxidx[4] = {-1, -1, -1, -1};
    Dtype maxmult[4] = {Dtype(-FLT_MAX), Dtype(-FLT_MAX),
                        Dtype(-FLT_MAX), Dtype(-FLT_MAX)};

    /* Normalization maps a feature-map coordinate x into [-1, 1] ROI space:
         y = 2 * (x - roi_start) / (roi_end - roi_start) - 1
       The four regularly spaced sample points in that space are
         (-0.5,-0.5), (-0.5,0.5), (0.5,-0.5), (0.5,0.5), stored as (x, y). */
    const Dtype samples_n[8] = {-0.5, -0.5, -0.5, 0.5,
                                 0.5, -0.5,  0.5, 0.5};
    const int num_samples = 4;

    const int h_first = static_cast<int>(ceil(hstart));
    const int h_last = static_cast<int>(floor(hend));
    const int w_first = static_cast<int>(ceil(wstart));
    const int w_last = static_cast<int>(floor(wend));
    // Flat index of element (0, 0) of the (roi_batch_ind, c) plane.
    const int plane_base = (roi_batch_ind * channels + c) * height * width;

    // Bilinearly interpolate the 4 sampled values and keep the largest.
    for (int smp = 0; smp < num_samples; ++smp) {
      const Dtype x_smp_n = samples_n[2 * smp];
      const Dtype y_smp_n = samples_n[2 * smp + 1];
      Dtype bisampled = 0.0;
      int b_index[4] = {-1, -1, -1, -1};
      Dtype multiplier[4] = {Dtype(-FLT_MAX), Dtype(-FLT_MAX),
                             Dtype(-FLT_MAX), Dtype(-FLT_MAX)};
      int counter = 0;

      // At most the first four footprint elements (row-major) contribute;
      // `counter < 4` terminates both loops once they have been taken.
      for (int h_idx = h_first;
           counter < 4 && h_idx <= h_last && h_idx < height && h_idx >= 0;
           ++h_idx) {
        for (int w_idx = w_first;
             counter < 4 && w_idx <= w_last && w_idx < width && w_idx >= 0;
             ++w_idx) {
          b_index[counter] = plane_base + h_idx * width + w_idx;

          // Normalize width and height to lie between -1 and 1
          Dtype h_idx_n = static_cast<Dtype>(
              (static_cast<Dtype>(2) *
               (static_cast<Dtype>(h_idx) - roi_start_h) /
               (roi_end_h - roi_start_h)) - 1);
          Dtype w_idx_n = static_cast<Dtype>(
              (static_cast<Dtype>(2) *
               (static_cast<Dtype>(w_idx) - roi_start_w) /
               (roi_end_w - roi_start_w)) - 1);
          h_idx_n = min(max(h_idx_n, static_cast<Dtype>(-1.0)), one);
          w_idx_n = min(max(w_idx_n, static_cast<Dtype>(-1.0)), one);

          // Weight falls off linearly with distance to the sample point.
          multiplier[counter] =
              max(zero, static_cast<Dtype>(1 - fabs(x_smp_n - w_idx_n))) *
              max(zero, static_cast<Dtype>(1 - fabs(y_smp_n - h_idx_n)));
          bisampled += bottom_data[b_index[counter]] * multiplier[counter];
          ++counter;
        }  // w
      }  // h

      if (bisampled > maxvalue) {
        maxvalue = bisampled;
        for (int i = 0; i < 4; ++i) {
          maxidx[i] = b_index[i];
          maxmult[i] = multiplier[i];
        }
      }
    }  // smp

    // Store value in the top blob, plus its provenance for backward.
    top_data[index] = maxvalue;
    const int argmax_index = index * 4;
    for (int i = 0; i < 4; ++i) {
      argmax_idx[argmax_index + i] = maxidx[i];
      argmax_mult[argmax_index + i] = maxmult[i];
    }
  }
}
/**
 * GPU forward pass: launches ROIAlignForward with one loop iteration per
 * element of top[0]. The kernel fills top[0] and both argmax buffers
 * (max_pts_, max_mult_).
 */
template <typename Dtype>
void ROIAlignLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* feature_map = bottom[0]->gpu_data();
  const Dtype* rois = bottom[1]->gpu_data();
  Dtype* pooled = top[0]->mutable_gpu_data();
  int* winner_idx = max_pts_.mutable_gpu_data();
  Dtype* winner_mult = max_mult_.mutable_gpu_data();
  const int num_outputs = top[0]->count();

  LOG(INFO) << "Doing forward now";
  // The block size is a hand-picked 32 rather than CAFFE_CUDA_NUM_THREADS;
  // the kernel's CUDA_KERNEL_LOOP lets each thread cover several outputs.
  // NOLINT_NEXT_LINE(whitespace/operators)
  ROIAlignForward<Dtype><<<CAFFE_GET_BLOCKS(num_outputs), 32>>>(
      num_outputs, feature_map, spatial_scale_, channels_, height_, width_,
      pooled_height_, pooled_width_, rois, pooled, winner_idx, winner_mult);
  LOG(INFO) << "Done forward ";
  CUDA_POST_KERNEL_CHECK;
}
/**
 * CUDA backward kernel: one loop iteration per bottom element (n, c, h, w).
 *
 * For every ROI that belongs to image n, the kernel scans the ROI's output
 * bins for channel c and, wherever the forward pass recorded this element's
 * flat index in argmax_idx, adds top_diff * recorded weight to the gradient.
 *
 * Fix relative to the previous version: ROIs were pre-filtered with
 * [ceil(start), floor(end)] of the scaled ROI box, but the forward kernel
 * lays its bins over an extent of (end - start + 1) and can therefore sample
 * rows/columns beyond floor(end). Gradients for those elements were silently
 * dropped. The pre-filter now uses a conservative bound derived from the
 * forward geometry, [ceil(start), floor(start + extent) + 1]; it is only an
 * optimisation, the argmax_idx comparison decides what actually contributes.
 *
 * @param nthreads     number of bottom elements (bottom[0] count)
 * @param num_rois     number of ROIs in bottom_rois
 * @param bottom_rois  ROIs, 5 values each: [batch_index x1 y1 x2 y2]
 */
template <typename Dtype>
__global__ void ROIAlignBackward(const int nthreads, const Dtype* top_diff,
    const int* argmax_idx, const Dtype* argmax_mult, const int num_rois,
    const Dtype spatial_scale, const int channels, const int height,
    const int width, const int pooled_height, const int pooled_width,
    Dtype* bottom_diff, const Dtype* bottom_rois) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // (n, c, h, w) coords in bottom data
    const int w = index % width;
    const int h = (index / width) % height;
    const int c = (index / width / height) % channels;
    const int n = index / width / height / channels;

    const Dtype one = 1.0;
    Dtype gradient = 0.0;

    // Accumulate gradient over all ROIs that pooled this element
    for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
      const Dtype* offset_bottom_rois = bottom_rois + roi_n * 5;
      const int roi_batch_ind = offset_bottom_rois[0];
      // Skip if ROI's batch index doesn't match n
      if (n != roi_batch_ind) {
        continue;
      }

      // Same scaled box and "+1" extent as ROIAlignForward.
      const Dtype roi_start_w = offset_bottom_rois[1] * spatial_scale;
      const Dtype roi_start_h = offset_bottom_rois[2] * spatial_scale;
      const Dtype roi_end_w = offset_bottom_rois[3] * spatial_scale;
      const Dtype roi_end_h = offset_bottom_rois[4] * spatial_scale;
      const Dtype roi_width = max(roi_end_w - roi_start_w + one, one);
      const Dtype roi_height = max(roi_end_h - roi_start_h + one, one);

      // Conservative footprint of the forward pass for this ROI. The extra
      // +1 on the upper bound absorbs rounding in the bin-edge arithmetic.
      const int first_w = static_cast<int>(ceil(roi_start_w));
      const int first_h = static_cast<int>(ceil(roi_start_h));
      const int last_w = static_cast<int>(floor(roi_start_w + roi_width)) + 1;
      const int last_h = static_cast<int>(floor(roi_start_h + roi_height)) + 1;
      if (w < first_w || w > last_w || h < first_h || h > last_h) {
        continue;
      }

      const int offset = (roi_n * channels + c) * pooled_height * pooled_width;
      const int argmax_offset = offset * 4;
      const Dtype* offset_top_diff = top_diff + offset;
      const int* offset_argmax_idx = argmax_idx + argmax_offset;
      const Dtype* offset_argmax_mult = argmax_mult + argmax_offset;

      for (int ph = 0; ph < pooled_height; ++ph) {
        for (int pw = 0; pw < pooled_width; ++pw) {
          const int bin = ph * pooled_width + pw;
          // Each output element recorded up to 4 contributing indices.
          for (int k = 0; k < 4; ++k) {
            if (offset_argmax_idx[bin * 4 + k] == index) {
              const Dtype multiplier = offset_argmax_mult[bin * 4 + k];
              gradient += offset_top_diff[bin] * multiplier;
            }
          }
        }  // pw
      }  // ph
    }  // rois

    bottom_diff[index] = gradient;
  }
}
/**
 * GPU backward pass: zeroes the bottom[0] gradient and launches
 * ROIAlignBackward with one loop iteration per bottom element. Nothing is
 * done when propagate_down[0] is false; bottom[1] (the ROIs) never receives
 * a gradient.
 */
template <typename Dtype>
void ROIAlignLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const bool wants_gradient = propagate_down[0];
  if (!wants_gradient) {
    return;
  }

  const Dtype* rois = bottom[1]->gpu_data();
  const Dtype* pooled_grad = top[0]->gpu_diff();
  Dtype* feature_grad = bottom[0]->mutable_gpu_diff();
  const int num_inputs = bottom[0]->count();
  caffe_gpu_set(num_inputs, Dtype(0.), feature_grad);

  const int* winner_idx = max_pts_.gpu_data();
  const Dtype* winner_mult = max_mult_.gpu_data();
  const int roi_count = top[0]->num();

  LOG(INFO) << "Doing backward ";
  // The block size is a hand-picked 16 rather than CAFFE_CUDA_NUM_THREADS;
  // the kernel's CUDA_KERNEL_LOOP lets each thread cover several elements.
  // NOLINT_NEXT_LINE(whitespace/operators)
  ROIAlignBackward<Dtype><<<CAFFE_GET_BLOCKS(num_inputs), 16>>>(
      num_inputs, pooled_grad, winner_idx, winner_mult, roi_count,
      spatial_scale_, channels_, height_, width_, pooled_height_,
      pooled_width_, feature_grad, rois);
  LOG(INFO) << "Done backward";
  CUDA_POST_KERNEL_CHECK;
}
// Caffe macro (see caffe/common.hpp): explicitly instantiates the layer's
// Forward_gpu / Backward_gpu member templates defined in this .cu file.
INSTANTIATE_LAYER_GPU_FUNCS(ROIAlignLayer);
} // namespace caffe
测试文件
测试文件是为了验证新定义的层在CPU和GPU下正向和反向传播是否正确,可选。一般每一个层都有对应的测试文件,放在./src/caffe/test/路径下
test_roi_align_layer.cpp
// ------------------------------------------------------------------
// Fast R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Ross Girshick
// ------------------------------------------------------------------
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <vector>

#include "boost/scoped_ptr.hpp"
#include "gtest/gtest.h"

#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
// The layer under test lives in roi_align_layer.hpp; the copied include of
// roi_pooling_layer.hpp did not declare ROIAlignLayer.
#include "caffe/layers/roi_align_layer.hpp"

#include "caffe/test/test_caffe_main.hpp"
#include "caffe/test/test_gradient_check_util.hpp"

using boost::scoped_ptr;

namespace caffe {

// The gradient test is only run on the GPU implementation.
typedef ::testing::Types<GPUDevice<float>, GPUDevice<double> > TestDtypesGPU;

/**
 * Fixture for ROIAlignLayer: a Gaussian-filled 4 x 3 x 12 x 8 feature map and
 * four hand-written ROIs, including one degenerate ROI with x1 == x2 and
 * y1 == y2.
 */
template <typename TypeParam>
class ROIAlignLayerTest : public MultiDeviceTest<TypeParam> {
  typedef typename TypeParam::Dtype Dtype;

 protected:
  ROIAlignLayerTest()
      : blob_bottom_data_(new Blob<Dtype>(4, 3, 12, 8)),
        blob_bottom_rois_(new Blob<Dtype>(4, 5, 1, 1)),
        blob_top_data_(new Blob<Dtype>()) {
    // fill the values
    FillerParameter filler_param;
    filler_param.set_std(10);
    GaussianFiller<Dtype> filler(filler_param);
    filler.Fill(this->blob_bottom_data_);
    blob_bottom_vec_.push_back(blob_bottom_data_);

    // One row per ROI: [batch_index x1 y1 x2 y2], with x < 8 and y < 12 so
    // that every ROI lies inside the 12 x 8 feature map.
    const int kNumRois = 4;
    const int kRoiSize = 5;
    const Dtype rois[kNumRois][kRoiSize] = {
        {0, 1, 1, 6, 6},
        {2, 6, 2, 7, 11},
        {1, 3, 1, 5, 10},
        {0, 3, 3, 3, 3},
    };
    Dtype* roi_data = blob_bottom_rois_->mutable_cpu_data();
    for (int r = 0; r < kNumRois; ++r) {
      for (int k = 0; k < kRoiSize; ++k) {
        roi_data[r * kRoiSize + k] = rois[r][k];
      }
    }
    blob_bottom_vec_.push_back(blob_bottom_rois_);
    blob_top_vec_.push_back(blob_top_data_);
  }

  virtual ~ROIAlignLayerTest() {
    delete blob_bottom_data_;
    delete blob_bottom_rois_;
    delete blob_top_data_;
  }

  Blob<Dtype>* const blob_bottom_data_;
  Blob<Dtype>* const blob_bottom_rois_;
  Blob<Dtype>* const blob_top_data_;
  vector<Blob<Dtype>*> blob_bottom_vec_;
  vector<Blob<Dtype>*> blob_top_vec_;
};

TYPED_TEST_CASE(ROIAlignLayerTest, TestDtypesGPU);

// Numerically checks the gradient with respect to bottom[0] (the feature
// map) for a 6 x 6 pooled output; spatial_scale keeps its proto default.
TYPED_TEST(ROIAlignLayerTest, TestGradient) {
  typedef typename TypeParam::Dtype Dtype;
  LayerParameter layer_param;
  // The layer reads roi_align_param in LayerSetUp, so that is the field that
  // must be populated (mutable_roi_pooling_param() has a different type).
  ROIAlignParameter* roi_align_param =
      layer_param.mutable_roi_align_param();
  roi_align_param->set_pooled_h(6);
  roi_align_param->set_pooled_w(6);
  ROIAlignLayer<Dtype> layer(layer_param);
  GradientChecker<Dtype> checker(1e-4, 1e-2);
  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
      this->blob_top_vec_, 0);
}

}  // namespace caffe
至此,需要添加完全自定义的文件就已经就位了,当然头文件和测试文件参照同类型的层写还是可以的。
修改文件
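在./src/caffe/proto/文件夹中的caffe.proto文件中增加自定义层的相关参数和层的ID号
- 在caffe.proto文件中查找【message LayerParameter {···} 】并找到当前已有层的最大ID号, 不妨假设当前max_id=151,则在{···}末尾新增一行 optional LayernamexxParameter layernamexx_param=152; 对本文的层而言,即 optional ROIAlignParameter roi_align_param = 152;(字段名必须是roi_align_param,因为层的源码通过layer_param_.roi_align_param()读取参数)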

- 在caffe.proto文件中添加可以在网络结构.prototxt文件中调用层时可指定的参数
- uint32,float为参数类型;
- = 之后是该字段的编号(field number),在同一个message内必须唯一,通常从1开始依次递增; - [default=] 之后是该参数的默认值;
//============================ added by tower =============================
// Message that stores parameters used by ROIAlignLayer
message ROIAlignParameter {
  // Size of the fixed output grid produced for every ROI. The layer rejects
  // values that are not > 0.
  optional uint32 pooled_h = 1 [default = 7]; // The pooled output height
  optional uint32 pooled_w = 2 [default = 7]; // The pooled output width
  // Multiplicative spatial scale factor to translate ROI coords from their
  // input scale to the scale used when pooling
  // NOTE(review): 0.625 is an unusual default for a feature-map scale (a
  // stride-16 backbone would use 0.0625) - confirm it is intended, and set
  // spatial_scale explicitly in the prototxt.
  optional float spatial_scale = 3 [default = 0.625];
}
//=========================================================================
至此,需要改动的地方就已经全部完成啦~
重新编译
首先切换到caffe根目录,cd ~/mask/faster-mask/caffe,使用命令清空编译的记录
[Yanglp@lenovo1 caffe]$ make clean
然后可以看到根目录下的隐藏文件夹./.build_release被删除了,
接着先编译caffe根目录之外的lib文件夹
[Yanglp@lenovo1 caffe]$ cd ../lib
[Yanglp@lenovo1 lib]$ make
python setup.py build_ext --inplace
running build_ext
skipping 'utils/bbox.c' Cython extension (up-to-date)
skipping 'nms/cpu_nms.c' Cython extension (up-to-date)
skipping 'nms/gpu_nms.cpp' Cython extension (up-to-date)
skipping 'pycocotools/_mask.c' Cython extension (up-to-date)
rm -rf build
解决编译报错
编译lib完成后,再回到caffe目录下make,但此时编译会报错:
- 问题1:编译器版本不对

解决方法:切换g++和gcc版本,以下分别是查看编译器和切换版本的命令:
[Yanglp@lenovo1 lib]$ g++ --version
g++ (GCC) 4.4.7 20120313 (Red Hat 4.4.7-17)
Copyright (C) 2010 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
[Yanglp@lenovo1 lib]$ scl enable devtoolset-1.1 bash
[Yanglp@lenovo1 lib]$ g++ --version
g++ (GCC) 4.7.2 20121015 (Red Hat 4.7.2-5)
Copyright (C) 2012 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
[Yanglp@lenovo1 lib]$ cd ../caffe
- 问题2 :找不到caffe.pb.h文件
make: *** [.build_release/src/caffe/blob.o] Error 1 In file included from ./include/caffe/util/device_alternate.hpp:40:0, from ./include/caffe/common.hpp:19, from src/caffe/util/io.cpp:18: ./include/caffe/util/cudnn.hpp:8:34: fatal error: caffe/proto/caffe.pb.h: No such file or directory compilation terminated. make: *** [.build_release/src/caffe/util/io.o] Error 1 In file included from ./include/caffe/util/device_alternate.hpp:40:0, from ./include/caffe/common.hpp:19, from src/caffe/util/ma th_functions.cpp:6: ./include/caffe/util/cudnn.hpp:8:34: fatal error: caffe/proto/caffe.pb.h: No such file or directory compilation terminated.
解决方法:由于我们修改了./src/caffe/proto/caffe.proto文件又make clean了,就需要自己生成对应的caffe.pb.h
[Yanglp@lenovo1 caffe]$ cd /share/manage/Yanglp/mask/faster-mask/caffe/src/caffe/proto/
[Yanglp@lenovo1 proto]$ protoc --cpp_out=/share/manage/Yanglp/mask/faster-mask/caffe/include/caffe/ caffe.proto
[Yanglp@lenovo1 proto]$ cd ../../../..
[Yanglp@lenovo1 caffe]$ find ./ -name "*caffe.pb.h*"
./include/caffe/caffe.pb.h
[Yanglp@lenovo1 caffe]$
使用protoc命令将caffe.proto文件生成对应的caffe.pb.h文件即可
再次编译
[Yanglp@lenovo1 caffe]$ make -j8 && make pycaffe
可以看到,自定义的层顺利通过编译
