Deformable Convolution: Principles, Implementation, and Production Deployment

Introduction

This article uses the object detection model CenterNet as a running example to introduce the principle of deformable convolution, its Python implementation, and how to deploy it in production.

Principles of Deformable Convolution

Deformable convolution currently comes in two versions: DCN v1 and its improved successor DCN v2. For the detailed algorithm, see the original papers or the following reference:

https://cloud.tencent.com/developer/article/1679389
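To summarize the core idea (following the notation of the DCN papers): a standard convolution samples the input on a fixed grid, DCN v1 learns a per-location offset for every kernel tap, and DCN v2 additionally learns a modulation scalar for every tap:

$$
y(p_0) = \sum_{k=1}^{K} w_k \cdot x\!\left(p_0 + p_k + \Delta p_k\right) \cdot \Delta m_k
$$

where $p_k$ ranges over the $K$ regular grid offsets of the kernel (e.g. $K = 9$ for a $3 \times 3$ kernel), $\Delta p_k$ is the learned offset and $\Delta m_k \in [0, 1]$ the learned modulation for tap $k$ (DCN v1 is the special case $\Delta m_k \equiv 1$). Because $p_0 + p_k + \Delta p_k$ is fractional, $x(\cdot)$ is evaluated by bilinear interpolation; this is exactly what the im2col CUDA kernel discussed below implements.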

Implementing Deformable Convolution

The implementation here mainly follows the explanation in https://cloud.tencent.com/developer/article/1638363
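As a rough illustration of what that implementation boils down to, below is a minimal, unoptimized C++ sketch of the modulated deformable sampling for one output location of a 3x3 kernel on a single channel. The function names and the layout of the offset/mask arrays are illustrative only, not the actual API of the code referenced above.

#include <cmath>

// Bilinear interpolation of a single-channel H x W feature map at a
// fractional location (y, x); out-of-bounds samples contribute zero.
static float bilinear(const float* data, int H, int W, float y, float x) {
    int y0 = static_cast<int>(std::floor(y)), x0 = static_cast<int>(std::floor(x));
    int y1 = y0 + 1, x1 = x0 + 1;
    float ly = y - y0, lx = x - x0;
    auto at = [&](int yy, int xx) -> float {
        return (yy >= 0 && yy < H && xx >= 0 && xx < W) ? data[yy * W + xx] : 0.f;
    };
    return (1 - ly) * (1 - lx) * at(y0, x0) + (1 - ly) * lx * at(y0, x1)
         +      ly  * (1 - lx) * at(y1, x0) +      ly  * lx * at(y1, x1);
}

// One output value of a modulated deformable (DCN v2) 3x3 convolution on a
// single input channel: each kernel tap samples at a learned offset and is
// scaled by a learned mask value.
float dcn_v2_sample(const float* input, int H, int W,
                    const float* weight,   // 9 kernel weights
                    const float* offset,   // 18 values: (dy, dx) per tap
                    const float* mask,     // 9 modulation scalars in [0, 1]
                    int out_y, int out_x, int stride, int pad, int dilation) {
    float acc = 0.f;
    for (int k = 0; k < 9; ++k) {
        int ky = k / 3, kx = k % 3;
        // Regular grid location of this tap, shifted by its learned offset.
        float y = out_y * stride - pad + ky * dilation + offset[2 * k];
        float x = out_x * stride - pad + kx * dilation + offset[2 * k + 1];
        acc += weight[k] * mask[k] * bilinear(input, H, W, y, x);
    }
    return acc;
}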

Deploying DCN V2 from CenterNet with TensorRT

The code referenced here is https://github.com/CaoWGG/TensorRT-CenterNet


A few points need to be made clear up front:

  • To quantize and accelerate the model, CenterNet is first exported to ONNX, and the ONNX model is then converted to a TensorRT engine with NVIDIA's officially published onnx-tensorrt library.

  • TensorRT does not natively support the deformable convolution operation, so a custom plugin has to be implemented for it.

The DCNv2 plugin used by CenterNet is defined in the onnx-tensorrt folder and consists of two pairs of files: dcn_v2_im2col_cuda.cu / dcn_v2_im2col_cuda.h, and DCNv2.cpp / DCNv2.h. The latter pair is the TensorRT custom plugin itself; its code is as follows:

// Constructor: cache the convolution hyper-parameters and copy the kernel
// weights and bias from the ONNX graph into host-side buffers.
DCNv2Plugin::DCNv2Plugin(int in_channel,
                         int out_channel,
                         int kernel_H,
                         int kernel_W,
                         int deformable_group,
                         int dilation,
                         int groups,
                         int padding,
                         int stride,
                         nvinfer1::Weights const &weight,
                         nvinfer1::Weights const &bias)
    : _in_channel(in_channel), _out_channel(out_channel),
      _kernel_H(kernel_H), _kernel_W(kernel_W),
      _deformable_group(deformable_group), _dilation(dilation),
      _groups(groups), _padding(padding), _stride(stride),
      _initialized(false) {

    if (weight.type == nvinfer1::DataType::kFLOAT) {
        _h_weight.assign((float*)weight.values, (float*)weight.values + weight.count);
    } else {
        throw std::runtime_error("Unsupported weight dtype");
    }

    if (bias.type == nvinfer1::DataType::kFLOAT) {
        _h_bias.assign((float*)bias.values, (float*)bias.values + bias.count);
    } else {
        throw std::runtime_error("Unsupported bias dtype");
    }
}

// initialize(): allocate device buffers for the weights, bias, the im2col
// "columns" matrix and a vector of ones (used to broadcast the bias), then
// copy the host-side parameters to the GPU.
int DCNv2Plugin::initialize() {
    if (_initialized) return 0;
    auto _output_dims = this->getOutputDimensions(0, &this->getInputDims(0), 3);
    assert(is_CHW(this->getInputDims(0)));
    assert(is_CHW(_output_dims));
    size_t ones_size = _output_dims.d[1] * _output_dims.d[2] * sizeof(float);
    size_t weight_size = _h_weight.size() * sizeof(float);
    size_t bias_size = _h_bias.size() * sizeof(float);
    float *ones_cpu = new float[ones_size / sizeof(float)];
    for (int i = 0; i < ones_size / sizeof(float); i++) {
        ones_cpu[i] = 1.0;
    }
    CHECK_CUDA(cudaMalloc((void**)&_d_columns, _in_channel * _kernel_H * _kernel_W * ones_size));
    CHECK_CUDA(cudaMalloc((void**)&_d_ones, ones_size));
    CHECK_CUDA(cudaMalloc((void**)&_d_weight, weight_size));
    CHECK_CUDA(cudaMalloc((void**)&_d_bias, bias_size));
    CHECK_CUDA(cudaMemcpy(_d_ones, ones_cpu, ones_size, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(_d_weight, _h_weight.data(), weight_size, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(_d_bias, _h_bias.data(), bias_size, cudaMemcpyHostToDevice));
    delete[] ones_cpu;
    _initialized = true;
    return 0;
}

// terminate(): release the device memory allocated in initialize().
void DCNv2Plugin::terminate() {
    if (!_initialized) {
        return;
    }
    cudaFree(_d_columns);
    cudaFree(_d_bias);
    cudaFree(_d_weight);
    cudaFree(_d_ones);
    _initialized = false;
}

DCNv2Plugin::~DCNv2Plugin() {
    terminate();
}

// supportsFormat(): the plugin only accepts FP32 tensors.
bool DCNv2Plugin::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const {
    return (type == nvinfer1::DataType::kFLOAT);
}

// getOutputDimensions(): derive the CHW output shape from the input shape and
// the convolution hyper-parameters (the batch dimension is handled implicitly
// by TensorRT and is not part of these Dims).
nvinfer1::Dims DCNv2Plugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, int nbInputs) {
    assert(index == 0);
    assert(inputDims);
    assert(nbInputs == 3);
    nvinfer1::Dims const& input = inputDims[0];
    assert(is_CHW(input));
    nvinfer1::Dims output;
    output.nbDims = input.nbDims;
    for (int d = 0; d < input.nbDims; ++d) {
        output.type[d] = input.type[d];
        output.d[d] = input.d[d];
    }
    output.d[0] = _out_channel;
    output.d[1] = (output.d[1] + 2 * _padding - (_dilation * (_kernel_H - 1) + 1)) / _stride + 1;
    output.d[2] = (output.d[2] + 2 * _padding - (_dilation * (_kernel_W - 1) + 1)) / _stride + 1;
    return output;
}

// getWorkspaceSize(): the plugin manages its own buffers in initialize(),
// so no extra TensorRT workspace is requested.
size_t DCNv2Plugin::getWorkspaceSize(int maxBatchSize) const {
    return 0;
}

// enqueue(): the forward pass of the plugin.
// Step 1: broadcast the bias into the output with a GEMM (ones x bias).
// Step 2: run the modulated deformable im2col kernel, which gathers the
//         offset/mask-adjusted input samples into the columns matrix.
// Step 3: multiply the weight matrix by the columns matrix (the standard
//         im2col formulation of convolution), accumulating onto the bias.
int DCNv2Plugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace,
                         cudaStream_t stream) {
    float alpha, beta;
    int m, n, k;

    cublasHandle_t handle = blas_handle();
    const float* input  = static_cast<const float *>(inputs[0]);
    const float* offset = static_cast<const float *>(inputs[1]);
    const float* mask   = static_cast<const float *>(inputs[2]);
    float* output = static_cast<float *>(outputs[0]);
    nvinfer1::Dims input_dims = this->getInputDims(0);
    assert(batchSize == 1);
    int h = input_dims.d[1];
    int w = input_dims.d[2];
    int height_out = (h + 2 * _padding - (_dilation * (_kernel_H - 1) + 1)) / _stride + 1;
    int width_out  = (w + 2 * _padding - (_dilation * (_kernel_W - 1) + 1)) / _stride + 1;
    m = _out_channel;
    n = height_out * width_out;
    k = 1;
    alpha = 1.0;
    beta  = 0.0;
    // output: n x m, ones: 1 x n (transposed to n x 1), bias: 1 x m
    // ones x bias = n x m, i.e. the bias broadcast over all output locations
    cublasSgemm(handle,
                CUBLAS_OP_T, CUBLAS_OP_N,
                n, m, k, &alpha,
                _d_ones, k,
                _d_bias, k, &beta,
                output, n);
    // im2col with learned offsets and masks
    modulated_deformable_im2col_cuda(stream, input, offset, mask,
                                     1, _in_channel, h, w,
                                     height_out, width_out, _kernel_H, _kernel_W,
                                     _padding, _padding, _stride, _stride, _dilation, _dilation,
                                     _deformable_group, _d_columns);
    m = _out_channel;
    n = height_out * width_out;
    k = _in_channel * _kernel_H * _kernel_W;
    alpha = 1.0;
    beta  = 1.0;  // accumulate onto the bias written in step 1
    // weights (m x k) x columns (k x n) -> output (m x n)
    cublasSgemm(handle,
                CUBLAS_OP_N, CUBLAS_OP_N,
                n, m, k, &alpha,
                _d_columns, n,
                _d_weight, k,
                &beta,
                output, n);
    return 0;
}
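The enqueue logic is the classic im2col + GEMM formulation of convolution: once the deformable im2col kernel has written the columns matrix, the convolution collapses to a single matrix product, with the bias broadcast by the first GEMM against the vector of ones allocated in initialize():

$$
Y_{m \times n} = W_{m \times k}\, C_{k \times n} + b_{m \times 1}\, \mathbf{1}_{1 \times n},
\qquad m = C_{out},\quad k = C_{in} \cdot k_H \cdot k_W,\quad n = H_{out} \cdot W_{out}
$$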

Next, the op implemented by the plugin above has to be registered as an importer in onnx-tensorrt's builtin_op_importers.cpp:

// Importer for the custom "DCNv2" ONNX node: validate the inputs, read the
// convolution attributes, and create a DCNv2Plugin layer in the network.
DEFINE_BUILTIN_OP_IMPORTER(DCNv2) {
    ASSERT(inputs.at(0).is_tensor(),  ErrorCode::kUNSUPPORTED_NODE); // input
    ASSERT(inputs.at(1).is_tensor(),  ErrorCode::kUNSUPPORTED_NODE); // offset
    ASSERT(inputs.at(2).is_tensor(),  ErrorCode::kUNSUPPORTED_NODE); // mask
    ASSERT(inputs.at(3).is_weights(), ErrorCode::kUNSUPPORTED_NODE); // weight

    auto kernel_weights = inputs.at(3).weights();
    nvinfer1::Weights bias_weights;
    if (inputs.size() == 5) {
        // Optional bias: must be 1-D and match the number of output channels.
        ASSERT(inputs.at(4).is_weights(), ErrorCode::kUNSUPPORTED_NODE);
        auto shaped_bias_weights = inputs.at(4).weights();
        ASSERT(shaped_bias_weights.shape.nbDims == 1, ErrorCode::kINVALID_NODE);
        ASSERT(shaped_bias_weights.shape.d[0] == kernel_weights.shape.d[0], ErrorCode::kINVALID_NODE);
        bias_weights = shaped_bias_weights;
    } else {
        bias_weights = ShapedWeights::empty(kernel_weights.type);
    }
    // The kernel tensor is laid out as [out_channel, in_channel, kH, kW].
    int out_channel, in_channel, kernel_H, kernel_W, deformable_group, dilation, groups, padding, stride;
    out_channel = kernel_weights.shape.d[0];
    in_channel  = kernel_weights.shape.d[1];
    kernel_H    = kernel_weights.shape.d[2];
    kernel_W    = kernel_weights.shape.d[3];

    // The remaining hyper-parameters come from the ONNX node attributes.
    OnnxAttrs attrs(node);
    deformable_group = attrs.get("deformable_group", 1);
    dilation = attrs.get("dilation", 1);
    groups   = attrs.get("groups", 1);
    padding  = attrs.get("padding", 1);
    stride   = attrs.get("stride", 1);
    RETURN_FIRST_OUTPUT(
        ctx->addPlugin(
            new DCNv2Plugin(in_channel, out_channel, kernel_H, kernel_W, deformable_group,
                            dilation, groups, padding, stride, kernel_weights, bias_weights),
            {&inputs.at(0).tensor(), &inputs.at(1).tensor(), &inputs.at(2).tensor()}));
}

The plugin is also registered in builtin_plugins.cpp:

REGISTER_BUILTIN_PLUGIN("DCNv2", DCNv2Plugin);

Then the plugin's source files have to be added to onnx-tensorrt's CMakeLists.txt so that they are compiled and linked into the parser library:

# Plugin source files (dcn_v2_im2col_cuda.cu and DCNv2.cpp added here)
set(PLUGIN_SOURCES
  FancyActivation.cu
  ResizeNearest.cu
  Split.cu
  dcn_v2_im2col_cuda.cu
  InstanceNormalization.cpp
  DCNv2.cpp
  plugin.cpp
  )

# Build the plugin library and link it against CUDA / cuBLAS / TensorRT
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC --expt-extended-lambda")
if(${CMAKE_VERSION} VERSION_LESS ${CMAKE_VERSION_THRESHOLD})
  CUDA_INCLUDE_DIRECTORIES(${CUDNN_INCLUDE_DIR} ${TENSORRT_INCLUDE_DIR})
  CUDA_ADD_LIBRARY(nvonnxparser_plugin STATIC ${PLUGIN_SOURCES})
else()
  include_directories(${CUDNN_INCLUDE_DIR} ${TENSORRT_INCLUDE_DIR})
  add_library(nvonnxparser_plugin STATIC ${PLUGIN_SOURCES})
endif()
target_include_directories(nvonnxparser_plugin PUBLIC ${CUDA_INCLUDE_DIRS} ${ONNX_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
target_link_libraries(nvonnxparser_plugin ${TENSORRT_LIBRARY} cuda cudart cublas)

Finally, link against the library built above and use the standard TensorRT C++ API to perform the model conversion and run forward inference.
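For reference, a minimal sketch of that conversion step is shown below. It assumes the older implicit-batch TensorRT API that the referenced repository targets (class and method signatures differ between TensorRT versions), and the ONNX/engine file names are placeholders.

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <NvInfer.h>
#include <NvOnnxParser.h>

// Minimal logger required by the TensorRT builder.
class Logger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) override {
        if (severity <= Severity::kWARNING) std::cout << msg << std::endl;
    }
} gLogger;

int main() {
    // Read the exported ONNX model into memory (file name is a placeholder).
    std::ifstream f("ctdet_coco_dla.onnx", std::ios::binary);
    std::stringstream buf;
    buf << f.rdbuf();
    std::string onnx = buf.str();

    // Build the network definition and parse the ONNX model. Because the
    // parser library was compiled together with the DCNv2 plugin above,
    // DCNv2 nodes in the graph are mapped onto DCNv2Plugin layers.
    auto builder = nvinfer1::createInferBuilder(gLogger);
    auto network = builder->createNetwork();
    auto parser  = nvonnxparser::createParser(*network, gLogger);
    if (!parser->parse(onnx.data(), onnx.size())) {
        std::cerr << "failed to parse ONNX model" << std::endl;
        return 1;
    }

    // Build and serialize the engine (implicit batch; batch size 1, matching
    // the assert(batchSize == 1) in DCNv2Plugin::enqueue).
    builder->setMaxBatchSize(1);
    builder->setMaxWorkspaceSize(1 << 30);
    auto engine = builder->buildCudaEngine(*network);
    auto serialized = engine->serialize();
    std::ofstream out("ctdet.engine", std::ios::binary);
    out.write(static_cast<const char*>(serialized->data()), serialized->size());
    return 0;
}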
