G2D图像处理硬件调用和测试-基于米尔-全志T113-i开发板

时间：2024-04-08 22:07:09

本篇测评由电子工程世界的优秀测评者“jf_99374259”提供。

本文将介绍基于米尔电子MYD-YT113i的G2D调用和测试。

MYC-YT113i及开发板

真正的国产核心板，100%国产物料认证
国产T113-i配备2*Cortex-A7@1.2GHz ，
外置接口、支撑、HiFi4
接口丰富：视频采集接口、接口、接口、CAN 接口、千兆接口
工业级：-40℃~+85℃、尺寸37mm*39mm
邮票孔+LGA，140+50PIN

全志 T113-i 2D图形加速硬件支持情况

Supports layer size up to 2048 x 2048 pixels
Supports pre-multiply alpha image data
Supports color key
Supports two pipes Porter-Duff alpha blending
Supports multiple video formats 4:2:0, 4:2:2, 4:1:1 and multiple pixel formats (8/16/24/32 bits graphics layer)
Supports memory scan order option
Supports any format convert function
Supports 1/16× to 32× resize ratio
Supports 32-phase 8-tap horizontal anti-alias filter and 32-phase 4-tap vertical anti-alias filter
Supports window clip
Supports FillRectangle, BitBlit, StretchBlit and MaskBlit
Supports horizontal and vertical flip, clockwise 0/90/180/270 degree rotate for normal buffer
Supports horizontal flip, clockwise 0/90/270 degree rotate for LBC buffer

可以看到 g2d 硬件支持相当多的2D图像处理，包括色彩空间转换、分辨率缩放、图层叠加、旋转等

开发环境配置

基础开发环境搭建参考上上上一篇

https://bbs.elecfans.com/jishu_2408808_1_1.html

除了工具链外，我们使用 opencv-mobile 加载输入图片和保存结果，用来查看色彩转换是否正常

g2d硬件直接采用标准的 ioctl 控制，只需要引入相关定义即可，无需链接so

https://github.com/MYIR-ALLWINNER/framework/blob/develop-yt113-framework/auto/sdk_lib/include/g2d_driver.h

另外，g2d的输入和输出数据必须在dmaion buffer上，因此还需要DmaIon.h头文件，用来分配和释放dmaion buffer

https://github.com/MYIR-ALLWINNER/framework/blob/develop-yt113-framework/auto/sdk_lib/include/DmaIon.h

基于C语言实现的YUV转RGB

这里复用之前T113-i JPG解码的函数

// Converts a YUV420 semi-planar image to packed 8-bit RGB in plain C++.
//
// yuv420sp : w*h luma bytes followed by interleaved chroma bytes, read here
//            in V,U order with one V/U pair shared by each 2x2 pixel block.
// w, h     : image size in pixels; pixels are consumed in 2x2 blocks.
// rgb      : destination, 3 bytes per pixel written as R,G,B, w*3 bytes per row.
//
// The conversion is fixed point with 6 fractional bits:
//   R = ((Y << 6) +  90 * (V-128)) >> 6
//   G = ((Y << 6) -  46 * (V-128) - 22 * (U-128)) >> 6
//   B = ((Y << 6) + 113 * (U-128)) >> 6
// The integer weights are the rounded-up forms of 87.72512, 44.672064,
// 21.608512 and 110.876544 (i.e. 64 * 1.370705, 0.698001, 0.337633, 1.732446).
void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb)
{
    // Clamp an intermediate result to the 0..255 range of one colour channel.
    const auto clamp_u8 = [](int value) -> unsigned char {
        return (unsigned char)::std::min(::std::max(value, 0), 255);
    };

    const unsigned char* luma = yuv420sp;
    const unsigned char* chroma = yuv420sp + w * h;

    for (int row = 0; row < h; row += 2)
    {
        const unsigned char* top_y = luma;
        const unsigned char* bottom_y = luma + w;
        unsigned char* top_rgb = rgb;
        unsigned char* bottom_rgb = rgb + w * 3;

        for (int col = 0; col < w; col += 2)
        {
            // One chroma pair drives the whole 2x2 block.
            const int v = chroma[0] - 128;
            const int u = chroma[1] - 128;
            chroma += 2;

            const int r_off = 90 * v;
            const int g_off = -46 * v + -22 * u;
            const int b_off = 113 * u;

            // Upper row of the block, left pixel then right pixel.
            for (int px = 0; px < 2; px++)
            {
                const int yy = top_y[col + px] << 6;
                unsigned char* out = top_rgb + (col + px) * 3;
                out[0] = clamp_u8((yy + r_off) >> 6);
                out[1] = clamp_u8((yy + g_off) >> 6);
                out[2] = clamp_u8((yy + b_off) >> 6);
            }

            // Lower row of the block, left pixel then right pixel.
            for (int px = 0; px < 2; px++)
            {
                const int yy = bottom_y[col + px] << 6;
                unsigned char* out = bottom_rgb + (col + px) * 3;
                out[0] = clamp_u8((yy + r_off) >> 6);
                out[1] = clamp_u8((yy + g_off) >> 6);
                out[2] = clamp_u8((yy + b_off) >> 6);
            }
        }

        // Advance by the two image rows that were just produced.
        luma += 2 * w;
        rgb += 2 * 3 * w;
    }
}

基于ARM neon优化的YUV转RGB

考虑到armv7编译器的自动neon优化能力较差，这里针对性地编写 arm neon inline assembly 实现YUV2RGB内核部分，达到最优化的性能，榨干cpu性能

// NEON-accelerated variant of the YUV420 semi-planar -> packed RGB conversion.
//
// yuv420sp : w*h luma bytes followed by interleaved chroma bytes, read here
//            in V,U order with one V/U pair shared by each 2x2 pixel block.
// w, h     : image size in pixels; pixels are consumed in 2x2 blocks.
// rgb      : destination, 3 bytes per pixel written as R,G,B, w*3 bytes per row.
//
// When __ARM_NEON is set, each pair of rows is processed 8 pixels at a time
// (intrinsics on aarch64, inline assembly otherwise) and the remaining
// w % 8 pixels fall through to the scalar loop. Without NEON the scalar loop
// handles the whole row.
void yuv420sp2rgb_neon(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb)
{
    const unsigned char* yptr = yuv420sp;
    const unsigned char* vuptr = yuv420sp + w * h;

#if __ARM_NEON
    // Chroma bias and the fixed-point (6 fractional bits) colour weights.
    uint8x8_t _v128 = vdup_n_u8(128);
    int8x8_t _v90 = vdup_n_s8(90);
    int8x8_t _v46 = vdup_n_s8(46);
    int8x8_t _v22 = vdup_n_s8(22);
    int8x8_t _v113 = vdup_n_s8(113);
#endif // __ARM_NEON

    for (int y = 0; y < h; y += 2)
    {
        const unsigned char* yptr0 = yptr;
        const unsigned char* yptr1 = yptr + w;
        unsigned char* rgb0 = rgb;
        unsigned char* rgb1 = rgb + w * 3;

#if __ARM_NEON
        int nn = w >> 3;            // number of 8-pixel groups per row
        int remain = w - (nn << 3); // pixels left for the scalar tail
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // Y << 6 for 8 pixels of the upper and the lower row.
            int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
            int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));

            // Subtract 128 from the 4 V/U pairs, then duplicate every V and
            // every U so that each pixel lane has its own copy.
            int8x8_t _vvuu = vreinterpret_s8_u8(vsub_u8(vld1_u8(vuptr), _v128));
            int8x8x2_t _vvvvuuuu = vtrn_s8(_vvuu, _vvuu);
            int8x8_t _vv = _vvvvuuuu.val[0];
            int8x8_t _uu = _vvvvuuuu.val[1];

            int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
            int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
            _g0 = vmlsl_s8(_g0, _uu, _v22);
            int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);

            int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
            int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
            _g1 = vmlsl_s8(_g1, _uu, _v22);
            int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);

            // >> 6 with unsigned saturation to 8 bits.
            uint8x8x3_t _rgb0;
            _rgb0.val[0] = vqshrun_n_s16(_r0, 6);
            _rgb0.val[1] = vqshrun_n_s16(_g0, 6);
            _rgb0.val[2] = vqshrun_n_s16(_b0, 6);

            uint8x8x3_t _rgb1;
            _rgb1.val[0] = vqshrun_n_s16(_r1, 6);
            _rgb1.val[1] = vqshrun_n_s16(_g1, 6);
            _rgb1.val[2] = vqshrun_n_s16(_b1, 6);

            // Interleaved store: R,G,B for 8 pixels of each row.
            vst3_u8(rgb0, _rgb0);
            vst3_u8(rgb1, _rgb1);

            yptr0 += 8;
            yptr1 += 8;
            vuptr += 8;
            rgb0 += 24;
            rgb1 += 24;
        }
#else
        if (nn > 0)
        {
            // Same computation as the intrinsics loop above, hand-scheduled.
            // Every template line must end in "\n" so the assembler sees one
            // instruction per line.
            asm volatile(
                "0:                             \n"
                "pld        [%3, #128]          \n"
                "vld1.u8    {d2}, [%3]!         \n"
                "vsub.s8    d2, d2, %12         \n"
                "pld        [%1, #128]          \n"
                "vld1.u8    {d0}, [%1]!         \n"
                "pld        [%2, #128]          \n"
                "vld1.u8    {d1}, [%2]!         \n"
                "vshll.u8   q2, d0, #6          \n"
                "vorr       d3, d2, d2          \n"
                "vshll.u8   q3, d1, #6          \n"
                "vorr       q9, q2, q2          \n"
                "vtrn.s8    d2, d3              \n"
                "vorr       q11, q3, q3         \n"
                "vmlsl.s8   q9, d2, %14         \n"
                "vorr       q8, q2, q2          \n"
                "vmlsl.s8   q11, d2, %14        \n"
                "vorr       q10, q3, q3         \n"
                "vmlal.s8   q8, d2, %13         \n"
                "vmlal.s8   q2, d3, %16         \n"
                "vmlal.s8   q10, d2, %13        \n"
                "vmlsl.s8   q9, d3, %15         \n"
                "vmlal.s8   q3, d3, %16         \n"
                "vmlsl.s8   q11, d3, %15        \n"
                "vqshrun.s16 d24, q8, #6        \n"
                "vqshrun.s16 d26, q2, #6        \n"
                "vqshrun.s16 d4, q10, #6        \n"
                "vqshrun.s16 d25, q9, #6        \n"
                "vqshrun.s16 d6, q3, #6         \n"
                "vqshrun.s16 d5, q11, #6        \n"
                "subs       %0, #1              \n"
                "vst3.u8    {d24-d26}, [%4]!    \n"
                "vst3.u8    {d4-d6}, [%5]!      \n"
                "bne        0b                  \n"
                : "=r"(nn),    // %0
                "=r"(yptr0), // %1
                "=r"(yptr1), // %2
                "=r"(vuptr), // %3
                "=r"(rgb0),  // %4
                "=r"(rgb1)   // %5
                : "0"(nn),
                "1"(yptr0),
                "2"(yptr1),
                "3"(vuptr),
                "4"(rgb0),
                "5"(rgb1),
                "w"(_v128), // %12
                "w"(_v90),  // %13
                "w"(_v46),  // %14
                "w"(_v22),  // %15
                "w"(_v113)  // %16
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26");
        }
#endif // __aarch64__
#endif // __ARM_NEON

        // Scalar tail: converts whatever the vector path left over, one 2x2
        // block per iteration.
#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255)
        for (; remain > 0; remain -= 2)
        {
            // R = Y + (1.370705 * (V-128))
            // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
            // B = Y + (1.732446 * (U-128))

            // R = ((Y << 6) + 87.72512 * (V-128)) >> 6
            // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6
            // B = ((Y << 6) + 110.876544 * (U-128)) >> 6

            // R = ((Y << 6) + 90 * (V-128)) >> 6
            // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
            // B = ((Y << 6) + 113 * (U-128)) >> 6

            // R = (yy + 90 * vv) >> 6
            // G = (yy - 46 * vv - 22 * uu) >> 6
            // B = (yy + 113 * uu) >> 6

            int v = vuptr[0] - 128;
            int u = vuptr[1] - 128;

            int ruv = 90 * v;
            int guv = -46 * v + -22 * u;
            int buv = 113 * u;

            int y00 = yptr0[0] << 6;
            rgb0[0] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6);
            rgb0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6);
            rgb0[2] = SATURATE_CAST_UCHAR((y00 + buv) >> 6);

            int y01 = yptr0[1] << 6;
            rgb0[3] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6);
            rgb0[4] = SATURATE_CAST_UCHAR((y01 + guv) >> 6);
            rgb0[5] = SATURATE_CAST_UCHAR((y01 + buv) >> 6);

            int y10 = yptr1[0] << 6;
            rgb1[0] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6);
            rgb1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6);
            rgb1[2] = SATURATE_CAST_UCHAR((y10 + buv) >> 6);

            int y11 = yptr1[1] << 6;
            rgb1[3] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6);
            rgb1[4] = SATURATE_CAST_UCHAR((y11 + guv) >> 6);
            rgb1[5] = SATURATE_CAST_UCHAR((y11 + buv) >> 6);

            yptr0 += 2;
            yptr1 += 2;
            vuptr += 2;
            rgb0 += 6;
            rgb1 += 6;
        }
#undef SATURATE_CAST_UCHAR

        // Advance by the two image rows that were just produced.
        yptr += 2 * w;
        rgb += 2 * 3 * w;
    }
}

基于G2D图形硬件的YUV转RGB

https://github.com/MYIR-ALLWINNER/framework/blob/develop-yt113-framework/auto/sdk_lib/sdk_memory/DmaIon.cpp

这里贴的代码省略了异常错误处理的逻辑，有个坑是 linux-4.9 和 linux-5.4 用法不一样，米尔电子的这个T113-i系统是linux-5.4，所以不兼容4.9内核的ioctl用法习惯

// Describes one buffer handed out by ion_allocator::alloc().
//
// The default values describe "nothing allocated": ion_allocator::free()
// treats fd == -1 as that state, so a default-constructed ion_memory can be
// passed to free() safely instead of carrying indeterminate values.
struct ion_memory
{
    size_t size = 0;           // byte length passed to alloc(); also used as the mmap/munmap length
    int fd = -1;               // descriptor returned by the allocation ioctl, -1 when unallocated
    void* virt_addr = nullptr; // CPU-visible mapping returned by mmap()
    unsigned int phy_addr = 0; // address reported by IOCTL_GET_IOMMU_ADDR
};

// Thin wrapper around the /dev/ion and /dev/cedar_dev devices used to obtain
// buffers for the G2D engine.
//
// The destructor calls close(), which closes both descriptors. Copying is
// therefore disabled: a copy would hold the same descriptor numbers and
// close them a second time.
class ion_allocator
{
public:
    ion_allocator();
    ~ion_allocator();

    ion_allocator(const ion_allocator&) = delete;
    ion_allocator& operator=(const ion_allocator&) = delete;

    // Opens the devices; returns 0.
    int open();
    // Closes whichever descriptors are currently open.
    void close();

    // Allocates `size` bytes and fills in *mem; returns 0.
    int alloc(size_t size, struct ion_memory* mem);
    // Releases the buffer described by *mem and resets it; returns 0.
    int free(struct ion_memory* mem);

    // Issues a dma-buf sync ioctl on the buffer's descriptor; returns 0.
    int flush(struct ion_memory* mem);

public:
    int ion_fd;   // descriptor for /dev/ion, -1 when not open
    int cedar_fd; // descriptor for /dev/cedar_dev, -1 when not open
};

// Starts with both device descriptors marked as not open (-1).
ion_allocator::ion_allocator()
    : ion_fd(-1), cedar_fd(-1)
{
}

// Tears the allocator down by delegating to close(), which closes any
// descriptor that is still open.
ion_allocator::~ion_allocator()
{
    this->close();
}

// Opens /dev/ion and /dev/cedar_dev and issues IOCTL_ENGINE_REQ on the cedar
// device. Descriptors left over from an earlier open() are closed first.
//
// Returns 0 on success. Returns -1 if either device cannot be opened or the
// engine request fails; in that case both descriptors are closed again and
// left at -1.
int ion_allocator::open()
{
    close();

    ion_fd = ::open("/dev/ion", O_RDWR);
    if (ion_fd < 0)
    {
        ion_fd = -1;
        return -1;
    }

    cedar_fd = ::open("/dev/cedar_dev", O_RDONLY);
    if (cedar_fd < 0)
    {
        cedar_fd = -1;
        ::close(ion_fd);
        ion_fd = -1;
        return -1;
    }

    if (ioctl(cedar_fd, IOCTL_ENGINE_REQ, 0) < 0)
    {
        // The request did not succeed, so close the descriptors directly
        // rather than through close(), which would issue IOCTL_ENGINE_REL.
        ::close(cedar_fd);
        cedar_fd = -1;
        ::close(ion_fd);
        ion_fd = -1;
        return -1;
    }

    return 0;
}

// Shuts down both devices. The cedar device gets IOCTL_ENGINE_REL before its
// descriptor is closed; the ion descriptor is simply closed. Descriptors
// already at -1 are skipped, so calling this repeatedly is harmless.
void ion_allocator::close()
{
    const int engine_fd = cedar_fd;
    cedar_fd = -1;
    if (engine_fd != -1)
    {
        ioctl(engine_fd, IOCTL_ENGINE_REL, 0);
        ::close(engine_fd);
    }

    const int heap_fd = ion_fd;
    ion_fd = -1;
    if (heap_fd != -1)
    {
        ::close(heap_fd);
    }
}

// Allocates `size` bytes through AW_ION_IOC_NEW_ALLOC, maps the buffer into
// this process and asks the cedar device for its IOMMU address.
//
// size : number of bytes to allocate and map.
// mem  : receives the buffer description. On failure it is left in the
//        "nothing allocated" state (fd == -1) that free() recognises.
//
// Returns 0 on success and -1 if any step fails. Everything acquired before
// the failing step is released again, so a failed call leaks neither the
// descriptor nor the mapping.
int ion_allocator::alloc(size_t size, struct ion_memory* mem)
{
    mem->size = 0;
    mem->fd = -1;
    mem->virt_addr = 0;
    mem->phy_addr = 0;

    struct aw_ion_new_alloc_data alloc_data;
    alloc_data.len = size;
    alloc_data.heap_id_mask = AW_ION_SYSTEM_HEAP_MASK;
    alloc_data.flags = AW_ION_CACHED_FLAG | AW_ION_CACHED_NEEDS_SYNC_FLAG;
    alloc_data.fd = 0;
    alloc_data.unused = 0;
    if (ioctl(ion_fd, AW_ION_IOC_NEW_ALLOC, &alloc_data) < 0)
    {
        // alloc_data.fd still holds the 0 written above; it must not be
        // mapped or closed as if it were a buffer.
        return -1;
    }

    void* virt_addr = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, alloc_data.fd, 0);
    if (virt_addr == MAP_FAILED)
    {
        ::close(alloc_data.fd);
        return -1;
    }

    struct aw_user_iommu_param iommu_param;
    iommu_param.fd = alloc_data.fd;
    iommu_param.iommu_addr = 0;
    if (ioctl(cedar_fd, IOCTL_GET_IOMMU_ADDR, &iommu_param) < 0)
    {
        munmap(virt_addr, size);
        ::close(alloc_data.fd);
        return -1;
    }

    mem->size = size;
    mem->fd = alloc_data.fd;
    mem->virt_addr = virt_addr;
    mem->phy_addr = iommu_param.iommu_addr;

    return 0;
}

int ion_allocator::free(struct ion_memory* mem)
{
if (mem->fd == -1)
return 0;

struct aw_user_iommu_param iommu_param;
iommu_param.fd = mem->fd;
ioctl(cedar_fd, IOCTL_FREE_IOMMU_ADDR, &iommu_param);

munmap(mem->virt_addr, mem->size);

::close(mem->fd);

mem->size = 0;
mem->fd = -1;
mem->virt_addr = 0;
mem->phy_addr = 0;

return 0;
}

// Issues DMA_BUF_IOCTL_SYNC with the END and RW flags on the buffer's
// descriptor. The article calls this after the CPU has written a buffer and
// before the CPU reads one back.
//
// mem : buffer whose descriptor receives the sync ioctl.
//
// Returns 0 on success and -1 if the ioctl fails.
int ion_allocator::flush(struct ion_memory* mem)
{
    struct dma_buf_sync sync;
    sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW;
    if (ioctl(mem->fd, DMA_BUF_IOCTL_SYNC, &sync) < 0)
        return -1;

    return 0;
}

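然后再实现基于 G2D图形硬件的 YUV转RGB 转换，步骤如下：

步骤1：为YUV和RGB图像数据分配dmaion buffer
步骤2：将YUV数据拷贝到dmaion buffer，flush cache完成同步
步骤3：配置转换参数，通过ioctl调用 G2D_CMD_BITBLT_H 执行转换
步骤4：flush cache完成同步，从dmaion buffer拷贝出RGB数据
步骤5：释放dmaion buffer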

ion_allocator ion;
ion.open();

struct ion_memory yuv_ion;
ion.alloc(rgb_size, &rgb_ion);

struct ion_memory rgb_ion;
ion.alloc(yuv_size, &yuv_ion);

int g2d_fd = ::open("/dev/g2d", O_RDWR步调2

mpy((unsigned char*)yuv_ion.virt_addr, yuv420sp, yuv_size);
ion.flush(&yuv_ion步调3

g2d_blt_h blit;
memset(&blit, 0, sizeof(blit));

blit.flag_h = G2D_BLT_NONE_H;

blit.src_image_h.format = G2D_FORMAT_YUV420UVC_V1U1V0U0;
blit.src_image_h.width = width;
blit.src_image_h.height = height;
blit.src_image_h.align[0] = 0;
blit.src_image_h.align[1] = 0;
blit.src_image_h.clip_rect.x = 0;
blit.src_image_h.clip_rect.y = 0;
blit.src_image_h.clip_rect.w = width;
blit.src_image_h.clip_rect.h = height;
blit.src_image_h.gamut = G2D_BT601;
blit.src_image_h.bpremul = 0;
blit.src_image_h.mode = G2D_PIXEL_ALPHA;
blit.src_image_h.use_phy_addr = 0;
blit.src_image_h.fd = yuv_ion.fd;

blit.dst_image_h.format = G2D_FORMAT_RGB888;
blit.dst_image_h.width = width;
blit.dst_image_h.height = height;
blit.dst_image_h.align[0] = 0;
blit.dst_image_h.clip_rect.x = 0;
blit.dst_image_h.clip_rect.y = 0;
blit.dst_image_h.clip_rect.w = width;
blit.dst_image_h.clip_rect.h = height;
blit.dst_image_h.gamut = G2D_BT601;
blit.dst_image_h.bpremul = 0;
blit.dst_image_h.mode = G2D_PIXEL_ALPHA;
blit.dst_image_h.use_phy_addr = 0;
blit.dst_image_h.fd = rgb_ion.fd;

ioctl(g2d_fd, G2D_CMD_BITBLT_H, &blit步调4

ion.flush(&rgb_ion);
memcpy(rgb, (const unsigned char*)rgb_ion.virt_addr, rgb_size步调5

ion.free(&rgb_ion);
ion.free(&yuv_ion);
ion.close();
::close(g2d_fd);

G2D图形硬件YUV转RGB测试

考虑到dmaion buffer分配释放比较耗时，我们提前做好分配，循环调用步骤3的G2D转换，统计耗时，并在top工具中查看占用率

sh-4.4# LD_LIBRARY_PATH=. ./g2dtest
INFO : cedarc : register mjpeg decoder success!
this device is not whitelisted for jpeg decoder cvi
this device is not whitelisted for jpeg decoder cvi
this device is not whitelisted for jpeg decoder cvi
this device is not whitelisted for jpeg encoder rkmpp
INFO : cedarc : Set log level to 5 from /vendor/etc/cedarc.conf
ERROR : cedarc : now cedarc log level:5
ERROR : cedarc : now cedarc log level:5
yuv420sp2rgb 46.61
yuv420sp2rgb 42.04
yuv420sp2rgb 41.32
yuv420sp2rgb 42.06
yuv420sp2rgb 41.69
yuv420sp2rgb 42.05
yuv420sp2rgb 41.29
yuv420sp2rgb 41.30
yuv420sp2rgb 42.14
yuv420sp2rgb 41.33
yuv420sp2rgb_neon 10.57
yuv420sp2rgb_neon 7.21
yuv420sp2rgb_neon 6.77
yuv420sp2rgb_neon 8.31
yuv420sp2rgb_neon 7.60
yuv420sp2rgb_neon 6.80
yuv420sp2rgb_neon 6.77
yuv420sp2rgb_neon 7.01
yuv420sp2rgb_neon 7.11
yuv420sp2rgb_neon 7.06
yuv420sp2rgb_g2d 4.32
yuv420sp2rgb_g2d 4.69
yuv420sp2rgb_g2d 4.56
yuv420sp2rgb_g2d 4.57
yuv420sp2rgb_g2d 4.52
yuv420sp2rgb_g2d 4.54
yuv420sp2rgb_g2d 4.52
yuv420sp2rgb_g2d 4.58
yuv420sp2rgb_g2d 4.60
yuv420sp2rgb_g2d 4.67

可以看到 ARM neon 的优化效果非常明显，使用G2D图形硬件能获得进一步加速，并且能显著降低CPU占用率！

转换结果对比分析

C和neon的转换结果完全一致，但是g2d转换后的图片有明显的色差

G2D图形硬件支持 G2D_BT601，G2D_BT709，G2D_BT2020 3种YUV系数，而JPG使用的YUV系数是改版BT601，因此产生了色差

https://github.com/MYIR-ALLWINNER/myir-t1-kernel/blob/develop-yt113-L5.4.61/drivers/char/sunxi_g2d/g2d_bsp_v2.c

从g2d内核驱动可以得知，暂时没有办法为g2d配置自定义的YUV系数，g2d不适合用于JPG的编解码，但依旧适合视频编解码的色彩空间转换

锐单商城拥有海量元器件数据手册、IC替代型号，打造电子元器件IC百科大全！

G2D图像处理硬件调用和测试-基于米尔-全志T113-i开发板

相关文章