!14671 [MSLITE][Develop] fix bug of arm cpu op resize_bicubic

From: @yangruoqi713 Reviewed-by: @zhang_xue_tong,@zhanghaibo5 Signed-off-by: @zhang_xue_tong
2021-04-07 10:08:19 +08:00 · 2021-04-07 10:08:19 +08:00 · 2e1ba212b8
parent 4bafc6d94e e6644c1aa9
commit 2e1ba212b8
3 changed files with 107 additions and 133 deletions
--- a/mindspore/lite/nnacl/fp32/resize_fp32.c
+++ b/mindspore/lite/nnacl/fp32/resize_fp32.c
@ -26,12 +26,13 @@ void CalculateCoordinate(float out, int in, int *bottom, int *top, float *bottom
 }

 static void BicubicBaseFunc(float a, const float x, float *weight) {
-  if (x > 1 && x < 2) {
-    weight[0] = a * x * x * x - 5 * a * x * x + 8 * a * x - 4 * a;
-  } else if (x >= 0 && x <= 1) {
-    weight[0] = ((a + 2) * x - (a + 3)) * x * x + 1;
+  float abs_x = fabsf(x);
+  if (abs_x >= 0 && abs_x <= 1) {
+    *weight = ((a + 2) * abs_x - (a + 3)) * abs_x * abs_x + 1;
+  } else if (abs_x > 1 && abs_x <= 2) {
+    *weight = a * abs_x * abs_x * abs_x - 5 * a * abs_x * abs_x + 8 * a * abs_x - 4 * a;
  } else {
-    weight[0] = 0;
+    *weight = 0;
  }
 }

@ -41,18 +42,18 @@ static void BicubicBaseFunc(float a, const float x, float *weight) {
 //        { 0,                                                             otherwise
 // The value of 'a' depends on whether half_pixel_center mode is used (same scheme as TensorFlow).
 // In half-pixel mode a equals -0.5; otherwise it is -0.75.
-void CalculateWightForBicubic(float out, int in, int *bottom, int *top, float *weights, float a) {
-  // can not exchange the order of calculating bottom[1] and bottom[0], because the order is decided outside.
-  bottom[1] = (int)(floorf(out));
-  bottom[0] = (bottom[1] - 1) < 0 ? 0 : (bottom[1] - 1);
-  top[0] = (bottom[1] + 1) < in ? (bottom[1] + 1) : (in - 1);
-  top[1] = (top[0] + 1) < in ? (top[0] + 1) : (in - 1);
+void CalculateWeightForBicubic(float out, int in, int *index, float *weights, float a) {
+  int floor_index = (int)(floorf(out));
+  index[0] = (floor_index - 1) < 0 ? 0 : (floor_index - 1);
+  index[1] = floor_index;
+  index[2] = (floor_index + 1) < in ? (floor_index + 1) : (in - 1);
+  index[3] = (floor_index + 2) < in ? (floor_index + 2) : (in - 1);

  // signed offset of each tap (floor_index - 1 .. floor_index + 2) from 'out'; BicubicBaseFunc works on |x|
-  float distance[4] = {1, 0, 1, 2};
-  float tmp_dis = out - (float)bottom[1];
-  distance[0] += tmp_dis;
-  distance[1] += tmp_dis;
+  float distance[4] = {-1, 0, 1, 2};
+  float tmp_dis = out - (float)floor_index;
+  distance[0] -= tmp_dis;
+  distance[1] -= tmp_dis;
  distance[2] -= tmp_dis;
  distance[3] -= tmp_dis;

@ -87,10 +88,9 @@ int PrepareResizeBilinear(const int *input_shape, const int *output_shape, Calcu
 }

 int PrepareResizeBicubic(const int *input_shape, const int *output_shape, CalculateOriginalCoordinate calculate,
-                         int *y_bottoms, int *y_tops, int *x_lefts, int *x_rights, float *y_weights, float *x_weights,
-                         float cubic_coeff) {
-  if (input_shape == NULL || output_shape == NULL || y_bottoms == NULL || y_tops == NULL || x_lefts == NULL ||
-      x_rights == NULL || y_weights == NULL || x_weights == NULL) {
+                         int *y_tops, int *x_lefts, float *y_weights, float *x_weights, float cubic_coeff) {
+  if (input_shape == NULL || output_shape == NULL || y_tops == NULL || x_lefts == NULL || y_weights == NULL ||
+      x_weights == NULL) {
    return NNACL_NULL_PTR;
  }

@ -101,11 +101,11 @@ int PrepareResizeBicubic(const int *input_shape, const int *output_shape, Calcul

  for (int h = 0; h < new_height; h++) {
    float actual_y = calculate(h, in_h, new_height);
-    CalculateWightForBicubic(actual_y, in_h, y_bottoms + 2 * h, y_tops + 2 * h, y_weights + 4 * h, cubic_coeff);
+    CalculateWeightForBicubic(actual_y, in_h, y_tops + 4 * h, y_weights + 4 * h, cubic_coeff);
  }
  for (int w = 0; w < new_width; w++) {
    float actual_x = calculate(w, in_w, new_width);
-    CalculateWightForBicubic(actual_x, in_w, x_lefts + 2 * w, x_rights + 2 * w, x_weights + 4 * w, cubic_coeff);
+    CalculateWeightForBicubic(actual_x, in_w, x_lefts + 4 * w, x_weights + 4 * w, cubic_coeff);
  }
  return NNACL_OK;
 }
@ -292,113 +292,94 @@ int ResizeBilinear(const float *input_data, float *output_data, const int *input
  return NNACL_OK;
 }

-void BicubicInterpRow(const float *src, float *dst, int len, const float *weights, const int *lefts, const int *rights,
-                      int in_c) {
-  int l = 0;
-  for (; l < len; l++) {
-    const float weight1 = weights[4 * l];
-    const float weight2 = weights[4 * l + 1];
-    const float weight3 = weights[4 * l + 2];
-    const float weight4 = weights[4 * l + 3];
+void BicubicInterpRow(const float *src, float *dst, const float *weights, const int *lefts, int width, int channel) {
+  for (int w = 0; w < width; w++) {
+    const float *weight = weights + 4 * w;
+    float *dst_w = dst + w * channel;
+    const float *src0_w = src + lefts[4 * w] * channel;
+    const float *src1_w = src + lefts[4 * w + 1] * channel;
+    const float *src2_w = src + lefts[4 * w + 2] * channel;
+    const float *src3_w = src + lefts[4 * w + 3] * channel;
    int c = 0;
 #ifdef ENABLE_NEON
-    float32x4_t weight1_vec = vdupq_n_f32(weight1);
-    float32x4_t weight2_vec = vdupq_n_f32(weight2);
-    float32x4_t weight3_vec = vdupq_n_f32(weight3);
-    float32x4_t weight4_vec = vdupq_n_f32(weight4);
+    float32x4_t weight0_vec = vdupq_n_f32(weight[0]);
+    float32x4_t weight1_vec = vdupq_n_f32(weight[1]);
+    float32x4_t weight2_vec = vdupq_n_f32(weight[2]);
+    float32x4_t weight3_vec = vdupq_n_f32(weight[3]);

-    for (; c <= in_c - 4; c += 4) {
-      float32x4_t src1_vec = vld1q_f32(src + lefts[2 * l] * in_c + c);
-      float32x4_t src2_vec = vld1q_f32(src + lefts[2 * l + 1] * in_c + c);
-      float32x4_t src3_vec = vld1q_f32(src + rights[2 * l] * in_c + c);
-      float32x4_t src4_vec = vld1q_f32(src + rights[2 * l + 1] * in_c + c);
+    for (; c <= channel - 4; c += 4) {
+      float32x4_t src0_vec = vld1q_f32(src0_w + c);
+      float32x4_t src1_vec = vld1q_f32(src1_w + c);
+      float32x4_t src2_vec = vld1q_f32(src2_w + c);
+      float32x4_t src3_vec = vld1q_f32(src3_w + c);

      float32x4_t interp_value =
-        src1_vec * weight1_vec + src2_vec * weight2_vec + src3_vec * weight3_vec + src4_vec * weight4_vec;
-      vst1q_f32(dst + l * in_c + c, interp_value);
+        src0_vec * weight0_vec + src1_vec * weight1_vec + src2_vec * weight2_vec + src3_vec * weight3_vec;
+      vst1q_f32(dst_w + c, interp_value);
    }
 #endif
-    int pos1 = lefts[2 * l] * in_c;
-    int pos2 = lefts[2 * l + 1] * in_c;
-    int pos3 = rights[2 * l] * in_c;
-    int pos4 = rights[2 * l + 1] * in_c;
-
-    for (; c < in_c; c++) {
-      float value1 = src[pos1 + c];
-      float value2 = src[pos2 + c];
-      float value3 = src[pos3 + c];
-      float value4 = src[pos4 + c];
-      dst[l * in_c + c] = value1 * weight1 + value2 * weight2 + value3 * weight3 + value4 * weight4;
+    for (; c < channel; c++) {
+      dst_w[c] = src0_w[c] * weight[0] + src1_w[c] * weight[1] + src2_w[c] * weight[2] + src3_w[c] * weight[3];
    }
  }
 }

-void BicubicInterpCol(const float *src1, const float *src2, const float *src3, const float *src4, float *dst, int len,
-                      const float *weights, int in_c) {
-  int l = 0;
-  for (; l < len; l++) {
+void BicubicInterpCol(const float *src, float *dst, const float *weights, int width, int channel) {
+  const float *src0 = src;
+  const float *src1 = src + width * channel;
+  const float *src2 = src + 2 * width * channel;
+  const float *src3 = src + 3 * width * channel;
+  for (int w = 0; w < width; w++) {
+    float *dst_w = dst + w * channel;
+    const float *src0_w = src0 + w * channel;
+    const float *src1_w = src1 + w * channel;
+    const float *src2_w = src2 + w * channel;
+    const float *src3_w = src3 + w * channel;
    int c = 0;
-    int l_stride = l * in_c;
-    const float weight1 = weights[4 * l];
-    const float weight2 = weights[4 * l + 1];
-    const float weight3 = weights[4 * l + 2];
-    const float weight4 = weights[4 * l + 3];
 #ifdef ENABLE_NEON
-    float32x4_t weight1_vec = vdupq_n_f32(weight1);
-    float32x4_t weight2_vec = vdupq_n_f32(weight2);
-    float32x4_t weight3_vec = vdupq_n_f32(weight3);
-    float32x4_t weight4_vec = vdupq_n_f32(weight4);
+    float32x4_t weight0_vec = vdupq_n_f32(weights[0]);
+    float32x4_t weight1_vec = vdupq_n_f32(weights[1]);
+    float32x4_t weight2_vec = vdupq_n_f32(weights[2]);
+    float32x4_t weight3_vec = vdupq_n_f32(weights[3]);

-    for (; c <= in_c - 4; c += 4) {
-      float32x4_t src1_vec = vld1q_f32(src1 + l_stride + c);
-      float32x4_t src2_vec = vld1q_f32(src2 + l_stride + c);
-      float32x4_t src3_vec = vld1q_f32(src3 + l_stride + c);
-      float32x4_t src4_vec = vld1q_f32(src4 + l_stride + c);
+    for (; c <= channel - 4; c += 4) {
+      float32x4_t src0_vec = vld1q_f32(src0_w + c);
+      float32x4_t src1_vec = vld1q_f32(src1_w + c);
+      float32x4_t src2_vec = vld1q_f32(src2_w + c);
+      float32x4_t src3_vec = vld1q_f32(src3_w + c);
      float32x4_t interp_value =
-        src1_vec * weight1_vec + src2_vec * weight2_vec + src3_vec * weight3_vec + src4_vec * weight4_vec;
-      vst1q_f32(dst + l_stride + c, interp_value);
+        src0_vec * weight0_vec + src1_vec * weight1_vec + src2_vec * weight2_vec + src3_vec * weight3_vec;
+      vst1q_f32(dst_w + c, interp_value);
    }
 #endif
-    for (; c < in_c; c++) {
-      float value1 = src1[l_stride + c];
-      float value2 = src2[l_stride + c];
-      float value3 = src3[l_stride + c];
-      float value4 = src4[l_stride + c];
-      dst[l_stride + c] = value1 * weight1 + value2 * weight2 + value3 * weight3 + value4 * weight4;
+    for (; c < channel; c++) {
+      dst_w[c] = src0_w[c] * weights[0] + src1_w[c] * weights[1] + src2_w[c] * weights[2] + src3_w[c] * weights[3];
    }
  }
 }

 void Bicubic(const float *input_data, float *output_data, const int *input_shape, const int *output_shape,
-             const int *y_bottom, const int *y_top, const int *x_lefts, const int *x_rights, const float *y_weights,
-             const float *x_weights, float *line_buffer, const int h_begin, const int h_end) {
+             const int *y_tops, const int *x_lefts, const float *y_weights, const float *x_weights, float *line_buffer,
+             const int h_begin, const int h_end) {
  int in_w = input_shape[2];
  int in_c = input_shape[3];
  int new_width = output_shape[2];
  int h_stride = new_width * in_c;

-  float *line_array[4] = {line_buffer, line_buffer + h_stride, line_buffer + 2 * h_stride, line_buffer + 3 * h_stride};
  for (int h = h_begin; h < h_end; h++) {
-    for (int i = 0; i < 2; ++i) {
-      BicubicInterpRow(input_data + y_bottom[2 * h + i] * in_w * in_c, line_array[i], new_width, x_weights, x_lefts,
-                       x_rights, in_c);
+    for (int i = 0; i < 4; ++i) {
+      BicubicInterpRow(input_data + y_tops[4 * h + i] * in_w * in_c, line_buffer + i * h_stride, x_weights, x_lefts,
+                       new_width, in_c);
    }
-    for (int j = 0; j < 2; ++j) {
-      BicubicInterpRow(input_data + y_top[2 * h + j] * in_w * in_c, line_array[j + 2], new_width, x_weights, x_lefts,
-                       x_rights, in_c);
-    }
-
-    BicubicInterpCol(line_array[0], line_array[1], line_array[2], line_array[3], output_data + h * h_stride, new_width,
-                     y_weights, in_c);
+    BicubicInterpCol(line_buffer, output_data + h * h_stride, y_weights + 4 * h, new_width, in_c);
  }
 }

 int ResizeBicubic(const float *input_data, float *output_data, const int *input_shape, const int *output_shape,
-                  const int *y_bottoms, const int *y_tops, const int *x_lefts, const int *x_rights,
-                  const float *y_weights, const float *x_weights, float *line_buffer, const int h_begin,
-                  const int h_end) {
-  if (input_data == NULL || output_data == NULL || input_shape == NULL || output_shape == NULL || y_bottoms == NULL ||
-      y_tops == NULL || x_lefts == NULL || x_rights == NULL || y_weights == NULL || x_weights == NULL) {
+                  const int *y_tops, const int *x_lefts, const float *y_weights, const float *x_weights,
+                  float *line_buffer, const int h_begin, const int h_end) {
+  if (input_data == NULL || output_data == NULL || input_shape == NULL || output_shape == NULL || y_tops == NULL ||
+      x_lefts == NULL || y_weights == NULL || x_weights == NULL) {
    return NNACL_NULL_PTR;
  }
  int input_cube_per_batch = input_shape[1] * input_shape[2] * input_shape[3];
@ -406,8 +387,8 @@ int ResizeBicubic(const float *input_data, float *output_data, const int *input_
  for (int b = 0; b < input_shape[0]; b++) {
    const float *input = input_data + b * input_cube_per_batch;
    float *output = output_data + b * output_cube_per_batch;
-    Bicubic(input, output, input_shape, output_shape, y_bottoms, y_tops, x_lefts, x_rights, y_weights, x_weights,
-            line_buffer, h_begin, h_end);
+    Bicubic(input, output, input_shape, output_shape, y_tops, x_lefts, y_weights, x_weights, line_buffer, h_begin,
+            h_end);
  }
  return NNACL_OK;
 }
--- a/mindspore/lite/nnacl/fp32/resize_fp32.h
+++ b/mindspore/lite/nnacl/fp32/resize_fp32.h
@ -32,8 +32,7 @@ int PrepareResizeBilinear(const int *input_shape, const int *output_shape, Calcu
                          float *x_left_weights);

 int PrepareResizeBicubic(const int *input_shape, const int *output_shape, CalculateOriginalCoordinate calculate,
-                         int *y_bottoms, int *y_tops, int *x_lefts, int *x_rights, float *y_bottom_weights,
-                         float *x_left_weights, float cubic_coeff);
+                         int *y_tops, int *x_lefts, float *y_weights, float *x_weights, float cubic_coeff);

 int ResizeBilinear(const float *input_data, float *output_data, const int *input_shape, const int *output_shape,
                   const int *y_bottoms, const int *y_tops, const int *x_lefts, const int *x_rights,
@ -41,9 +40,8 @@ int ResizeBilinear(const float *input_data, float *output_data, const int *input
                   const int h_begin, const int h_end);

 int ResizeBicubic(const float *input_data, float *output_data, const int *input_shape, const int *output_shape,
-                  const int *y_bottoms, const int *y_tops, const int *x_lefts, const int *x_rights,
-                  const float *y_bottom_weights, const float *x_left_weights, float *line_buffer, const int h_begin,
-                  const int h_end);
+                  const int *y_tops, const int *x_lefts, const float *y_weights, const float *x_weights,
+                  float *line_buffer, const int h_begin, const int h_end);

 int PrepareCropAndResizeBilinear(const int *input_shape, const float *boxes, const int *box_idx,
                                 const int *output_shape, int *y_bottoms, int *y_tops, int *x_lefts, int *x_rights,
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/resize_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/resize_fp32.cc
@ -84,28 +84,26 @@ int ResizeCPUKernel::ReSize() {
 // the calculation.
 void ResizeCPUKernel::CalTmpBufferLen(int *x_len, int *y_len, int *x_weight_len, int *y_weight_len) {
  if (method_ == static_cast<int>(schema::ResizeMethod_LINEAR)) {
-    *x_len = *x_weight_len = new_width_;
-    *y_len = *y_weight_len = new_height_;
+    *x_len = new_width_;
+    *y_len = new_height_;
+    *x_weight_len = new_width_;
+    *y_weight_len = new_height_;
  }
  if (method_ == static_cast<int>(schema::ResizeMethod_CUBIC)) {
-    *x_len = new_width_ * 2;
-    *y_len = new_height_ * 2;
+    *x_len = new_width_ * 4;
+    *y_len = new_height_ * 4;
    *x_weight_len = new_width_ * 4;
    *y_weight_len = new_height_ * 4;
  }
 }

-// If resize method is bicubic, x_lefts_ array stores two elements (index - 1, index - 2) for every output coordinate
-// index. For example, there is a 1-D output coordinate array:
-// [0, 0.5, 1]
-// now, search two elements at left and two at right for every position in output array.
-// Thus, x_lefts_ array looks like :
-//               x_lefts_ [-2,  -1, -1.5, -0.5, -1,  0]
-//                          \   /      \   /     \  /
-//                           \ /        \ /       \/
-// corresponding to index :   0        0.5        1
-// Apply to x_rights_ array by the same way.
+// If the resize method is bicubic, the x_lefts_ array stores four source indices (index - 1, index, index + 1,
+// index + 2) for every output coordinate, where index is the floor of the mapped source coordinate.
 int ResizeCPUKernel::MallocTmpBuffer() {
+  if (method_ != static_cast<int>(schema::ResizeMethod_LINEAR) &&
+      method_ != static_cast<int>(schema::ResizeMethod_CUBIC)) {
+    return RET_OK;
+  }
  // make sure y_bottoms_, y_tops_, etc. are null before malloc
  FreeTmpBuffer();

@ -116,12 +114,14 @@ int ResizeCPUKernel::MallocTmpBuffer() {
  {
    coordinate_.x_lefts_ = reinterpret_cast<int *>(malloc(sizeof(int) * x_len));
    CHECK_MALLOC_RES(coordinate_.x_lefts_, RET_NULL_PTR)
-    coordinate_.x_rights_ = reinterpret_cast<int *>(malloc(sizeof(int) * x_len));
-    CHECK_MALLOC_RES(coordinate_.x_rights_, RET_NULL_PTR)
    coordinate_.y_tops_ = reinterpret_cast<int *>(malloc(sizeof(int) * y_len));
    CHECK_MALLOC_RES(coordinate_.y_tops_, RET_NULL_PTR)
-    coordinate_.y_bottoms_ = reinterpret_cast<int *>(malloc(sizeof(int) * y_len));
-    CHECK_MALLOC_RES(coordinate_.y_bottoms_, RET_NULL_PTR)
+    if (method_ == static_cast<int>(schema::ResizeMethod_LINEAR)) {
+      coordinate_.x_rights_ = reinterpret_cast<int *>(malloc(sizeof(int) * x_len));
+      CHECK_MALLOC_RES(coordinate_.x_rights_, RET_NULL_PTR)
+      coordinate_.y_bottoms_ = reinterpret_cast<int *>(malloc(sizeof(int) * y_len));
+      CHECK_MALLOC_RES(coordinate_.y_bottoms_, RET_NULL_PTR)
+    }
  }

  // malloc memory for weights of x, y axes
@ -175,12 +175,12 @@ int ResizeCPUKernel::RunImpl(int task_id) {
  MSLITE_CHECK_PTR(output_data);

  auto input_shape = input->shape();
+  int unit = UP_DIV(new_height_, context_->thread_num_);
+  int h_begin = unit * task_id;
+  int h_end = std::min(h_begin + unit, new_height_);
+  int c = input_shape.at(3);
  switch (method_) {
    case static_cast<int>(schema::ResizeMethod_LINEAR): {
-      int unit = UP_DIV(new_height_, context_->thread_num_);
-      int h_begin = unit * task_id;
-      int h_end = std::min(h_begin + unit, new_height_);
-      int c = in_tensors_.at(0)->shape().at(3);
      float *line0 = line_buffer_ + new_width_ * c * 2 * task_id;
      float *line1 = line0 + new_width_ * c;
      return ResizeBilinear(input_data, output_data, input_shape.data(), out_tensors_.at(0)->shape().data(),
@ -192,14 +192,10 @@ int ResizeCPUKernel::RunImpl(int task_id) {
                                   calculate_, coordinate_transform_mode_, task_id, context_->thread_num_);
    }
    case static_cast<int>(schema::ResizeMethod_CUBIC): {
-      int unit = UP_DIV(new_height_, context_->thread_num_);
-      int h_begin = unit * task_id;
-      int h_end = std::min(h_begin + unit, new_height_);
-      int c = in_tensors_.at(0)->Channel();
      float *line_buffer = line_buffer_ + new_width_ * c * 4 * task_id;
      return ResizeBicubic(input_data, output_data, input_shape.data(), out_tensors_.at(0)->shape().data(),
-                           coordinate_.y_bottoms_, coordinate_.y_tops_, coordinate_.x_lefts_, coordinate_.x_rights_,
-                           y_weights_, x_weights_, line_buffer, h_begin, h_end);
+                           coordinate_.y_tops_, coordinate_.x_lefts_, y_weights_, x_weights_, line_buffer, h_begin,
+                           h_end);
    }
    default: {
      MS_LOG(ERROR) << "Resize unknown method " << method_;
@ -227,9 +223,8 @@ int ResizeCPUKernel::ResizePrepare() {
  }
  if (method_ == static_cast<int>(schema::ResizeMethod_CUBIC)) {
    auto cubic_coeff = reinterpret_cast<ResizeParameter *>(op_parameter_)->cubic_coeff_;
-    return PrepareResizeBicubic(input_shape.data(), out_tensors_.at(0)->shape().data(), calculate_,
-                                coordinate_.y_bottoms_, coordinate_.y_tops_, coordinate_.x_lefts_,
-                                coordinate_.x_rights_, y_weights_, x_weights_, cubic_coeff);
+    return PrepareResizeBicubic(input_shape.data(), out_tensors_.at(0)->shape().data(), calculate_, coordinate_.y_tops_,
+                                coordinate_.x_lefts_, y_weights_, x_weights_, cubic_coeff);
  }
  return RET_OK;
 }