git am
on Windows
Note to self: on Windows use git am --whitespace=nowarn --keep-cr
to apply commits
previously created by format-patch
. WTF?
posted at: 16:32 | path: /programming | permanent link
Here are some thoughts on fefe's counting newlines problem (in German).
Bottom line: SIMD is nifty!
posted at: 14:27 | path: /programming | permanent link
Linux has a relatively new (2.6.13, June 2005), easy-to-use interface to watch and monitor file operations: inotify. See also the Linux Journal article.
I plan to use it to see when new devices show up under /dev
:
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/inotify.h>
#include <unistd.h>

static const char *progname = "ino";

/*
 * Return a short label for an inotify event mask.
 *
 * The mask is tested bit-wise rather than compared for equality because the
 * kernel may OR additional flag bits (e.g. IN_ISDIR) into the event mask;
 * an exact comparison would report "???" for such events.
 *
 * Returns "create", "delete", or "???" if neither bit is set.
 */
static const char *inotify_mask_str(unsigned mask)
{
    if (mask & IN_CREATE) {
        return "create";
    }
    if (mask & IN_DELETE) {
        return "delete";
    }
    return "???";
}

/*
 * Watch /dev for created and deleted entries and print one line per event:
 * the byte count returned by read(), the raw event mask, a label for the
 * mask and the affected file name.
 *
 * Runs until a fatal error occurs; exits with EXIT_FAILURE in that case.
 */
int main(void)
{
    int fd = inotify_init1(0);
    if (fd < 0) {
        fprintf(stderr, "%s: init failed\n", progname);
        exit(EXIT_FAILURE);
    }

    int wd = inotify_add_watch(fd, "/dev", IN_CREATE | IN_DELETE);
    if (wd < 0) {
        fprintf(stderr, "%s: add watch failed\n", progname);
        exit(EXIT_FAILURE);
    }

    while (true) {
        /* The buffer is accessed through struct inotify_event pointers, so it
         * must be aligned for that type, not merely for bytes. */
        _Alignas(struct inotify_event)
        uint8_t buf[sizeof(struct inotify_event) + FILENAME_MAX + 1];

        ssize_t ret = read(fd, buf, sizeof(buf));
        if (ret < 0) {
            if (errno == EINTR) {
                continue;
            }
            fprintf(stderr, "%s: read failed\n", progname);
            exit(EXIT_FAILURE);
        }

        /* ret is non-negative here, so the conversion is safe and keeps the
         * loop arithmetic entirely unsigned. */
        size_t avail = (size_t) ret;
        size_t processed = 0;

        /* Walk every complete event in the buffer. Written as an addition on
         * the left-hand side so it can neither underflow nor skip an event
         * that carries no name (len == 0). */
        while (processed + sizeof(struct inotify_event) <= avail) {
            const struct inotify_event *iev =
                (const struct inotify_event *) &buf[processed];

            /* name[] is only present when len > 0 */
            printf("%4zd:%08x %s %s\n", ret, iev->mask,
                   inotify_mask_str(iev->mask),
                   iev->len > 0 ? iev->name : "");

            processed += sizeof(struct inotify_event) + iev->len;
        }
    }

    /* Not reached: the loop above only terminates via exit(). Kept to show
     * the proper teardown; a watch descriptor is released with
     * inotify_rm_watch(), not close(). */
    inotify_rm_watch(fd, wd);
    close(fd);
    return EXIT_SUCCESS;
}
posted at: 15:31 | path: /programming | permanent link
Encoding image data to JPEG is straightforward: use libjpeg. However, if you care about runtime performance, things get a bit more complicated... Here are some observations on the task on an ARM Cortex-A8 CPU.
dct_method = JDCT_IFAST
.
# time ./cjpeg -dct int -q 70 test.ppm > /tmp/test.jpg real 0m 1.70s # time ./cjpeg -dct fast -q 70 test.ppm > /tmp/test.jpg real 0m 1.04s
in_color_space = JCS_YCbCr
to avoid extra effort.
raw_data_in = TRUE
is used. The raw interface requires rearranging input data, however.
-O2
).
struct jpeg_compress_struct cinfo; struct jpeg_error_mgr jerr; cinfo.err = jpeg_std_error(&jerr); jpeg_create_compress(&cinfo); cinfo.image_width = w; cinfo.image_height = h; cinfo.input_components = 3; cinfo.in_color_space = JCS_YCbCr; // input color space jpeg_mem_dest(&cinfo, &outbuf, outbuf_size); jpeg_set_defaults(&cinfo); cinfo.dct_method = JDCT_IFAST; // DCT method // set up subsampling cinfo.raw_data_in = TRUE; cinfo.comp_info[0].h_samp_factor = 2; cinfo.comp_info[0].v_samp_factor = 2; cinfo.comp_info[1].h_samp_factor = 1; cinfo.comp_info[1].v_samp_factor = 1; cinfo.comp_info[2].h_samp_factor = 1; cinfo.comp_info[2].v_samp_factor = 1; jpeg_set_quality(&cinfo, 70, TRUE); jpeg_start_compress(&cinfo, TRUE); // allocate input data buffer JSAMPIMAGE data = malloc(sizeof(JSAMPARRAY) * cinfo.input_components); data[0] = malloc(sizeof(JSAMPROW) * (16 + 8 + 8)); data[1] = data[0] + 16; data[2] = data[0] + 16 + 8; // Y component data[0][0] = malloc(sizeof(JSAMPLE) * cinfo.image_width * 16); for (unsigned i = 1; i < 16; i++) data[0][i] = data[0][i-1] + cinfo.image_width; // U component data[1][0] = malloc(sizeof(JSAMPLE) * cinfo.image_width * 8 / 2); for (unsigned i = 1; i < 8; i++) data[1][i] = data[1][i-1] + cinfo.image_width / 2; // V component data[2][0] = malloc(sizeof(JSAMPLE) * cinfo.image_width * 8 / 2); for (unsigned i = 1; i < 8; i++) data[2][i] = data[2][i-1] + cinfo.image_width / 2; JSAMPLE *in = inbuf; for (unsigned i = 0; i < cinfo.image_height; i += 16) { JSAMPLE *yp = data[0][0], *up = data[1][0], *vp = data[2][0]; for (unsigned j = 0; j < 16; j += 2) { for (unsigned k = 0; k < cinfo.image_width * 2; k += 4) { *up++ = *in++; // assume UYVY *yp++ = *in++; *vp++ = *in++; *yp++ = *in++; } for (unsigned k = 0; k < cinfo.image_width * 2; k += 4) { in++; // subsample by dropping chroma data on odd lines *yp++ = *in++; in++; *yp++ = *in++; } } jpeg_write_raw_data(&cinfo, data, 16); } free(data[0][0]); free(data[1][0]); free(data[2][0]); free(data[0]); 
free(data); jpeg_finish_compress(&cinfo); jpeg_destroy_compress(&cinfo);
posted at: 15:19 | path: /programming | permanent link
How to efficiently convert an audio sample in 16-bit signed integer format to a 32-bit float value on an ARM NEON CPU? And how to achieve bit-exact results?
There are several ways to do it in different projects:
s16 -> float |
float -> s16 |
|
PulseAudio | flt = sample / (float) 0x7fff; |
sample = lrintf(clip_flt(flt) * 0x7fff) |
libavresample | flt = sample / (float) (1<<15); |
sample = (s16) clip_s16(lrintf(flt * (1 << 15))); |
RtAudio | flt = (sample + 0.5f) * (1 / 32767.5f); |
sample = (s16) (flt * 32767.5f - 0.5f); |
clip_s16()
saturates a 16-bit short integer (-32768..32767); clip_flt()
returns a float -1.0..1.0.
Observations regarding PulseAudio:
flt_to_s16(s16_to_flt(x)) != x
for x == -32768
x / (float) 0x7fff != x * (1.0f / 0x7fff)
on the other hand
x / (float) (1<<15) == x * (1.0f / (1<<15))
and the second form would make it possible to replace the division with a
multiplication by the inverse; the problem with the first form is a slight deviation for certain input values
lrintf()
rounds according to the current rounding mode (which by default is
round-to-nearest integer, with ties broken toward the even value). For example, this means:
12.3 -> 12 12.5 -> 12 (!) 12.7 -> 13 13.3 -> 13 13.5 -> 14 (!) 13.7 -> 14
So .5 values are rounded to the nearest even value.
/*
 * Convert four float samples read from src into four signed 16-bit samples
 * written to dst, using ARM NEON inline assembly.
 *
 * Steps: clamp the input from below at -1.0, scale by 32767, round to an
 * integer value by adding and then subtracting sign(x) * 2^23, convert to
 * 32-bit integers, and narrow to 16 bits with saturation.
 *
 * src and dst are passed by value, so the post-increment writeback in the
 * load/store instructions only advances the local copies.
 */
static void float_to_s16(const float *src, int16_t *dst)
{
    __asm__ __volatile__ (
        "vdup.f32 q2, %[two23] \n\t" /* q2 = 2^23 in every lane */
        "vdup.f32 q3, %[scale] \n\t" /* q3 = 32767.0 in every lane */
        "vdup.u32 q4, %[mask] \n\t" /* q4 = sign-bit mask 0x80000000 */
        "vdup.f32 q5, %[mone] \n\t" /* q5 = -1.0 in every lane */
        "vld1.32 {q0}, [%[src]]! \n\t" /* load x (four floats) */
        "vmaxq.f32 q0, q0, q5 \n\t" /* clip at -1.0 (lower bound only) */
        "vmul.f32 q0, q0, q3 \n\t" /* scale */
        "vand.u32 q1, q0, q4 \n\t" /* get sign bit */
        "vorr.u32 q1, q1, q2 \n\t" /* put sign on 2^23 */
        "vadd.f32 q0, q1, q0 \n\t" /* sgn(x)*2^23 + x ... */
        "vsub.f32 q0, q0, q1 \n\t" /* ... - sgn(x)*2^23 */
        "vcvt.s32.f32 q0, q0 \n\t" /* convert to int */
        "vqmovn.s32 d0, q0 \n\t" /* saturate and narrow to 16 bit */
        "vst1.16 {d0}, [%[dst]]! \n\t" /* store four 16-bit results */
        : [dst] "+r" (dst), [src] "+r" (src) /* output operands (or input operands that get modified) */
        : [scale] "r" (32767.0f), [two23] "r" (8.3886080000e+06f), [mask] "r" (0x80000000), [mone] "r" (-1.0f) /* input operands */
        : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5" /* clobber list */
    );
}
Observations:
vmaxq
instruction is needed to match PulseAudio's clipping semantics (clip -1.0f to -32767 instead of -32768); otherwise, vqmovn
takes care of narrowing a 32-bit signed integer to 16-bit and saturation.
vand
) and or-ing the sign to the two23
value.
lrintf()
rounding we have: one extra vand, vorr, vadd, vsub
-- not bad!
sample / (float) 0x7fff;
without actually performing the costly division. I am not saying that this makes much sense, but hey, we can :-)
First we observe that a discrepancy (i.e. sample / (float) 0x7fff != sample * (1.0f / 0x7fff)
) occurs when the binary representation of the input value
converted to float ends in 0x4000 (that is, (q0 & 0xffff) == 0x4000
after the vcvt
instruction). There are 1536 such problematic values over all possible inputs.
/*
 * Convert four signed 16-bit samples read from src into four floats written
 * to dst, using ARM NEON inline assembly.
 *
 * The samples are widened to 32 bit, converted to float and multiplied by
 * invscale. Before the multiply, the low 16 bits of each converted float are
 * compared against 0x4000; where they match, 1 is added to the bit pattern
 * of the product as a correction.
 *
 * NOTE(review): invscale is not defined in this snippet; presumably it is
 * 1.0f / 0x7fff defined elsewhere -- confirm.
 *
 * src and dst are passed by value, so the post-increment writeback in the
 * load/store instructions only advances the local copies.
 */
static void s16_to_float(const int16_t *src, float *dst)
{
    __asm__ __volatile__ (
        "vdup.f32 q1, %[invscale] \n\t" /* q1 = invscale in every lane */
        "vdup.u16 q3, %[mask] \n\t" /* q3 = 0x4000 in every 16-bit lane */
        "vdup.u32 q4, %[one] \n\t" /* q4 = 1 in every 32-bit lane */
        "vld1.16 {d0}, [%[src]]! \n\t" /* load x (four 16-bit samples) */
        "vmovl.s16 q0, d0 \n\t" /* s16 -> s32 */
        "vcvt.f32.s32 q0, q0 \n\t" /* s32 -> float */
        "vceq.u16 q2, q0, q3 \n\t" /* check for defect: 16-bit lanes equal to 0x4000 become 0xffff */
        "vand.u32 q2, q2, q4 \n\t" /* prepare 1 if defect: keep only bit 0 of each 32-bit word */
        "vmul.f32 q0, q0, q1 \n\t" /* multiply by invscale */
        "vadd.u32 q0, q0, q2 \n\t" /* correct if defect: integer add on the float bit pattern */
        "vst1.32 {q0}, [%[dst]]! \n\t" /* store four floats */
        : [dst] "+r" (dst), [src] "+r" (src) /* output operands (or input operands that get modified) */
        : [invscale] "r" (invscale), [mask] "r" (0x4000), [one] "r" (1) /* input operands */
        : "memory", "cc", "q0", "q1", "q2", "q3", "q4" /* clobber list */
    );
}
Observations:
vceq
checks for the problem condition; it sets each 16-bit word to 0xffff if it is 0x4000.
vand
just keeps the LSB in each 32-bit word, hence a 1 indicates the problem condition for each float.
vadd
adds the correction bit to the multiplication result -- making multiplication by the inverse of 0x7fff identical to division by 0x7fff.
vceq, vand, vadd
-- not bad!
posted at: 11:17 | path: /programming | permanent link