#if 0 void convert_neon(struct image_in const * __restrict__ input, struct image_out const * __restrict__ output) #endif { do_prefetch(&in[-1*stride - 1], 0); do_prefetch(&in[ 0*stride - 1], 0); do_prefetch(&in[+1*stride - 1], (rows_per_loop) < 3 ? 3 : 0); do_prefetch(&in[+2*stride - 1], (rows_per_loop) < 3 ? 3 : 0); if (rows_per_loop > 2) { do_prefetch(&in[+3*stride - 1], 0); do_prefetch(&in[+4*stride - 1], 3); } do { unsigned int const offs_r = offsetof(rgb_pixel_t, r); unsigned int const offs_g = offsetof(rgb_pixel_t, g); unsigned int const offs_b = offsetof(rgb_pixel_t, b); uint8x8x2_t rowp = vld2_u8(&in[-1*stride - 1]); uint8x8x2_t row0 = vld2_u8(&in[ 0*stride - 1]); uint8x8x2_t row1 = vld2_u8(&in[+1*stride - 1]); convert_fn0(&(out[0*out_stride]), offs_r, offs_g, offs_b, rowp.val[0], rowp.val[1], row0.val[0], row0.val[1], row1.val[0], row1.val[1]); uint8x8x2_t row2 = vld2_u8(&in[+2*stride - 1]); convert_fn1(&(out[1*out_stride]), offs_r, offs_g, offs_b, row0.val[0], row0.val[1], row1.val[0], row1.val[1], row2.val[0], row2.val[1]); if (rows_per_loop < 3) break; uint8x8x2_t row3 = vld2_u8(&in[+3*stride - 1]); convert_fn0(&(out[2*out_stride]), offs_r, offs_g, offs_b, row1.val[0], row1.val[1], row2.val[0], row2.val[1], row3.val[0], row3.val[1]); uint8x8x2_t row4 = vld2_u8(&in[+4*stride - 1]); convert_fn1(&(out[3*out_stride]), offs_r, offs_g, offs_b, row2.val[0], row2.val[1], row3.val[0], row3.val[1], row4.val[0], row4.val[1]); } while (0); }