pmeerw's blog

Sun, 26 Jan 2014

Encoding UYVY data to JPEG efficiently

Encoding image data to JPEG is straightforward: use libjpeg. However, if you care about runtime performance, things get a bit more complicated... Here are some observations on doing this on an ARM Cortex-A8 CPU.

  1. There is libjpeg-turbo which is a drop-in replacement for libjpeg and offers SIMD (in our case ARM NEON) support.
  2. Use dct_method = JDCT_IFAST.
    # time ./cjpeg -dct int -q 70 test.ppm > /tmp/test.jpg
    real	0m 1.70s
    # time ./cjpeg -dct fast -q 70 test.ppm > /tmp/test.jpg
    real	0m 1.04s
    
  3. Avoid color-space conversion: JPEG uses YUV internally, RGB input is converted to YUV before compression. Use in_color_space = JCS_YCbCr to avoid extra effort.
  4. Avoid downsampling. UYVY data has only half the chroma resolution in the horizontal direction. JPEG usually does chroma subsampling in both directions. The libjpeg interface cannot handle irregular input data unless raw_data_in = TRUE is used. The raw interface requires rearranging input data, however.
  5. Prepare input buffers efficiently; compiler settings matter (-O2).

// Configure a libjpeg compressor for one image. w, h, outbuf and outbuf_size
// are not declared in this snippet and must be provided by surrounding code.
struct jpeg_compress_struct cinfo;
struct jpeg_error_mgr jerr;
cinfo.err = jpeg_std_error(&jerr);

jpeg_create_compress(&cinfo);
cinfo.image_width = w;
cinfo.image_height = h;
cinfo.input_components = 3;
cinfo.in_color_space = JCS_YCbCr; // input is already YCbCr: no RGB->YUV conversion needed

// NOTE(review): libjpeg declares the size argument of jpeg_mem_dest() as a
// pointer (unsigned long *); confirm outbuf_size is declared accordingly.
jpeg_mem_dest(&cinfo, &outbuf, outbuf_size);
jpeg_set_defaults(&cinfo);
cinfo.dct_method = JDCT_IFAST; // fast integer DCT, the quicker of the two cjpeg timings

// set up subsampling: the application supplies the component planes itself
// (raw interface), with Y at factor 2x2 and both chroma components at 1x1,
// i.e. chroma at half the luma resolution horizontally and vertically
cinfo.raw_data_in = TRUE;
cinfo.comp_info[0].h_samp_factor = 2;
cinfo.comp_info[0].v_samp_factor = 2;
cinfo.comp_info[1].h_samp_factor = 1;
cinfo.comp_info[1].v_samp_factor = 1;
cinfo.comp_info[2].h_samp_factor = 1;
cinfo.comp_info[2].v_samp_factor = 1;

// quality 70, the same setting used for the cjpeg timings
jpeg_set_quality(&cinfo, 70, TRUE);
jpeg_start_compress(&cinfo, TRUE);

// allocate input data buffer
JSAMPIMAGE data = malloc(sizeof(JSAMPARRAY) * cinfo.input_components);
data[0] = malloc(sizeof(JSAMPROW) * (16 + 8 + 8));
data[1] = data[0] + 16;
data[2] = data[0] + 16 + 8;

// Y component
data[0][0] = malloc(sizeof(JSAMPLE) * cinfo.image_width * 16);
for (unsigned i = 1; i < 16; i++)
  data[0][i] = data[0][i-1] + cinfo.image_width;

// U component
data[1][0] = malloc(sizeof(JSAMPLE) * cinfo.image_width * 8 / 2);
for (unsigned i = 1; i < 8; i++)
  data[1][i] = data[1][i-1] + cinfo.image_width / 2;
  
// V component
data[2][0] = malloc(sizeof(JSAMPLE) * cinfo.image_width * 8 / 2);
for (unsigned i = 1; i < 8; i++)
  data[2][i] = data[2][i-1] + cinfo.image_width / 2;

// Repack the interleaved UYVY input (2 bytes per pixel) into the planar
// Y/U/V row buffers and pass them to the encoder in bands of 16 luma lines
// (8 chroma lines) per jpeg_write_raw_data() call.
//
// Chroma is taken from even lines and dropped on odd lines, halving the
// vertical chroma resolution to match the 2x2 / 1x1 sampling factors.
//
// If image_height is not a multiple of 16, the last band is padded by
// repeating the final input line, so inbuf is never read beyond
// image_width * image_height * 2 bytes. For heights that are a multiple of
// 16 the data written is exactly what the unpadded loop produced.
// NOTE(review): image_width is assumed to be even and to fit the row buffers
// as allocated; confirm that callers only pass such widths.
JSAMPLE *in = inbuf;
for (unsigned i = 0; i < cinfo.image_height; i += 16) {
  JSAMPLE *yp = data[0][0], *up = data[1][0], *vp = data[2][0];
  const unsigned line_bytes = cinfo.image_width * 2;

  // Input lines left for this band; below 16 only in a short final band.
  unsigned avail = cinfo.image_height - i;
  if (avail > 16)
    avail = 16;

  for (unsigned j = 0; j < 16; j++) {
    // Past the bottom edge of the image: step back one line so the last
    // valid input line is read again instead of running off the buffer.
    // avail >= 1 here, so at least one line of this band was already read.
    if (j >= avail)
      in -= line_bytes;

    if (j % 2 == 0) {
      for (unsigned k = 0; k < line_bytes; k += 4) {
        *up++ = *in++; // assume UYVY
        *yp++ = *in++;
        *vp++ = *in++;
        *yp++ = *in++;
      }
    } else {
      for (unsigned k = 0; k < line_bytes; k += 4) {
        in++; // subsample by dropping chroma data on odd lines
        *yp++ = *in++;
        in++;
        *yp++ = *in++;
      }
    }
  }
  jpeg_write_raw_data(&cinfo, data, 16);
}
  
free(data[0][0]);
free(data[1][0]);
free(data[2][0]);
free(data[0]);
free(data);

jpeg_finish_compress(&cinfo);
jpeg_destroy_compress(&cinfo);

posted at: 15:19 | path: /programming | permanent link

Made with PyBlosxom