Here you have the second round.
I think I have enough data to affirm the following:
1) the current hand-made asm has some serious bugs affecting both
correctness and efficiency
2) the naive approach has a non-marginal failure rate
3) the server-less approach is much less efficient than the server-based
approach
Take these results into consideration:
$ ./sum 2048 8 16384
CPU clock: 1460475899.426625
mix_areas0: 87191 0.032139%
mix_areas1: 145666 0.053692% (365)
mix_areas2: 3034611 1.118555% (1217)
mix_areas3: 327412 0.120684% (0)
The server-based approach needs about 0.03% of CPU power to mix one
stereo s16 stream @44100 Hz.
The fastest server-less approach I have been able to come up with so far
needs 0.12%.
It's not a big fraction, but I think that having a lot of machine power
available is never an excuse to waste it.
The naive approach is much better, but with 8 randomly mixed streams it
has a nearly 18% probability of giving wrong results (with an average
power for each stream of 25%).
I'd suggest using a dmix-like approach for pcm_share, pcm_snoop and for
the sum part of pcm_mix, but using a separate thread for saturate,
transfer to hardware and silence.
I hope this will help ALSA take the right path.
P.S. I had almost forgotten how enjoyable it is to work on ALSA ;-)
--
Abramo Bagnara mailto:[EMAIL PROTECTED]
Opera Unica Phone: +39.546.656023
Via Emilia Interna, 140
48014 Castel Bolognese (RA) - Italy
#include <stdlib.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>
/*
 * Read the CPU time-stamp counter into the 64-bit lvalue `val`.
 *
 * RDTSC delivers the counter split across EDX (high half) and EAX (low
 * half). The halves are captured separately and recombined here because
 * the "A" constraint only denotes the EDX:EAX pair on 32-bit x86; on
 * x86-64 it selects a single register and half of the counter is lost.
 * The expansion is a single statement, so `rdtscll(x);` keeps working.
 */
#define rdtscll(val) \
	do { \
		unsigned int rdtsc_lo_, rdtsc_hi_; \
		__asm__ __volatile__("rdtsc" : "=a" (rdtsc_lo_), "=d" (rdtsc_hi_)); \
		(val) = ((unsigned long long)rdtsc_hi_ << 32) | rdtsc_lo_; \
	} while (0)
/*
 * Branch hints for the compiler: likely(x) says x is expected to be 1,
 * unlikely(x) says x is expected to be 0. Both evaluate to x.
 */
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
/*
 * Sample types: s16 is used for the source/destination PCM buffers and
 * s32 for the sum buffers.
 * NOTE(review): assumes short is 16 bits and int is 32 bits on the target.
 */
typedef short int s16;
typedef int s32;
/*
 * Instruction prefix pasted in front of the atomic operations in
 * __cmpxchg() and atomic_add(): "lock" when CONFIG_SMP is defined, empty
 * otherwise.
 */
#ifdef CONFIG_SMP
#define LOCK_PREFIX "lock ; "
#else
#define LOCK_PREFIX ""
#endif
/*
 * Dummy aggregate type; __xg() casts an arbitrary pointer to it so that the
 * pointed-to object can be handed to inline asm as an "m" operand.
 * NOTE(review): presumably oversized so the compiler does not assume only a
 * few bytes at the address are accessed - confirm.
 */
struct __xchg_dummy { unsigned long a[100]; };
#define __xg(x) ((struct __xchg_dummy *)(x))
/*
 * Run the x86 CMPXCHG instruction (with LOCK_PREFIX) on the object at ptr,
 * using `old` as the comparand in the accumulator and `new` as the
 * replacement value.
 *
 * ptr  - address of the object to operate on
 * old  - value the object is compared against
 * new  - value offered as the replacement
 * size - width of the object in bytes; 1, 2 and 4 are handled
 *
 * Returns what CMPXCHG leaves in the accumulator, i.e. the value found at
 * ptr. For any other size nothing is executed and `old` is returned.
 */
static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
				      unsigned long new, int size)
{
	/* Unsupported widths fall through and report `old` unchanged */
	unsigned long prev = old;

	if (size == 1) {
		__asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
				     : "=a"(prev)
				     : "q"(new), "m"(*__xg(ptr)), "0"(old)
				     : "memory");
	} else if (size == 2) {
		__asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
				     : "=a"(prev)
				     : "q"(new), "m"(*__xg(ptr)), "0"(old)
				     : "memory");
	} else if (size == 4) {
		__asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
				     : "=a"(prev)
				     : "q"(new), "m"(*__xg(ptr)), "0"(old)
				     : "memory");
	}
	return prev;
}
/*
 * Typed front end for __cmpxchg(): the operand width is taken from
 * sizeof(*(ptr)), `o` and `n` are passed as unsigned long, and the returned
 * previous value is cast back to the type of *(ptr).
 */
#define cmpxchg(ptr,o,n)\
((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
(unsigned long)(n),sizeof(*(ptr))))
/*
 * Add v to *dst with a single read-modify-write ADDL instruction, prefixed
 * with LOCK_PREFIX.
 *
 * dst - 32-bit memory operand that is updated in place
 * v   - addend, passed as an immediate or in a register
 */
static inline void atomic_add(volatile int *dst, int v)
{
	/* "+m": *dst is both read and written by the instruction */
	__asm__ __volatile__(LOCK_PREFIX "addl %1,%0"
			     : "+m" (*dst)
			     : "ir" (v));
}
/*
 * Estimate the TSC frequency in ticks per second.
 *
 * The time-stamp counter and the wall clock are both sampled before and
 * after a one-second usleep(); the result is the tick delta divided by the
 * elapsed wall-clock time in seconds.
 */
static double
detect_cpu_clock()
{
	struct timeval start, stop;
	unsigned long long ticks_start, ticks_stop;
	double elapsed;

	/* Throw-away call so the measured calls run with a warm cache */
	gettimeofday(&start, 0);

	/* Opening sample: counter first, then wall clock */
	rdtscll(ticks_start);
	gettimeofday(&start, 0);

	usleep(1000000);

	/* Closing sample, taken in the same order as the opening one */
	rdtscll(ticks_stop);
	gettimeofday(&stop, 0);

	elapsed = stop.tv_sec - start.tv_sec +
		  (stop.tv_usec - start.tv_usec) / 1e6;
	return (ticks_stop - ticks_start) / elapsed;
}
/*
 * Accumulate one source stream into the 32-bit sum buffer.
 *
 * size     - number of samples to process
 * src      - source samples; advanced by src_step elements per sample
 * sum      - accumulator buffer; advanced by one element per sample
 * src_step - source stride, in s16 elements
 *
 * Each sample is added with atomic_add(); no clipping is done here.
 */
void mix_areas0(unsigned int size,
		const s16 *src,
		volatile s32 *sum,
		unsigned int src_step)
{
	unsigned int remaining;

	for (remaining = size; remaining != 0;
	     remaining--, src += src_step, sum++)
		atomic_add(sum, *src);
}
/*
 * Convert the 32-bit sums to 16-bit samples, clipping to the range
 * [-0x8000, 0x7fff].
 *
 * size     - number of samples to convert
 * dst      - output samples; advanced by dst_step elements per sample
 * sum      - input sums; advanced by one element per sample
 * dst_step - output stride, in s16 elements
 */
void saturate(unsigned int size,
	      s16 *dst, const s32 *sum,
	      unsigned int dst_step)
{
	const s32 *const end = sum + size;

	for (; sum != end; sum++, dst += dst_step) {
		s32 clipped = *sum;

		/* Out-of-range sums are hinted as the rare case */
		if (unlikely(clipped < -0x8000))
			clipped = -0x8000;
		else if (unlikely(clipped > 0x7fff))
			clipped = 0x7fff;
		*dst = clipped;
	}
}
/*
 * Naive mixer: add each source sample straight into the 16-bit destination
 * and clip the result to [-0x8000, 0x7fff].
 *
 * size     - number of samples to mix
 * dst      - destination samples, read and rewritten in place
 * src      - source samples
 * dst_step - destination stride, in s16 elements
 * src_step - source stride, in s16 elements
 *
 * Because the destination holds an already clipped value, whatever was cut
 * off by an earlier stream is not available to later ones.
 */
void mix_areas1(unsigned int size,
		volatile s16 *dst, const s16 *src,
		unsigned int dst_step, unsigned int src_step)
{
	unsigned int i;

	for (i = 0; i < size; i++, dst += dst_step, src += src_step) {
		s32 mixed = *dst + *src;
		s16 out;

		if (unlikely(mixed < -0x8000))
			out = -0x8000;
		else if (unlikely(mixed > 0x7fff))
			out = 0x7fff;
		else
			out = mixed;
		/* exactly one store to the volatile destination per sample */
		*dst = out;
	}
}
/*
 * Hand-written i386 version of the server-less mixer (the same algorithm
 * as the C loop in mix_areas3).
 *
 * size     - number of samples to mix
 * dst      - 16-bit destination samples
 * src      - 16-bit source samples
 * sum      - 32-bit accumulator buffer
 * dst_step - value added to the dst pointer after each sample
 * src_step - value added to the src pointer after each sample
 * sum_step - value added to the sum pointer after each sample
 *
 * The three steps are added to the pointer registers as-is, so they are
 * BYTE strides, not element counts.
 *
 * The code loads the pointers with movl into 32-bit registers and is
 * therefore only meaningful when built for 32-bit x86. The lock prefixes
 * are hard-coded and do not depend on LOCK_PREFIX.
 *
 * Register usage:
 * ESI - src
 * EDI - dst
 * EBX - sum
 * ECX - sample, then the snapshot of *sum being saturated
 * EAX - cmpxchg comparand / temporary
 * EDX - remaining sample count
 */
void mix_areas2(unsigned int size,
		volatile s16 *dst, const s16 *src,
		volatile s32 *sum, unsigned int dst_step,
		unsigned int src_step, unsigned int sum_step)
{
	__asm__ __volatile__ (
		"\n"
		/*
		 * initialization, load EDX, ESI, EDI, EBX registers
		 */
		"\tmovl %0, %%edx\n"
		"\tmovl %1, %%edi\n"
		"\tmovl %2, %%esi\n"
		"\tmovl %3, %%ebx\n"
		/*
		 * while (size-- > 0) {
		 */
		"\tcmp $0, %%edx\n"
		"jz 6f\n"
		"1:"
		/*
		 * sample = *src;
		 * if (cmpxchg(*dst, 0, 1) == 0)
		 *	sample -= *sum;
		 * atomic_add(*sum, sample);
		 *
		 * movswl leaves the flags alone, so the jnz below still
		 * tests the ZF produced by cmpxchgw.
		 */
		"\tmovw $0, %%ax\n"
		"\tmovw $1, %%cx\n"
		"\tlock; cmpxchgw %%cx, (%%edi)\n"
		"\tmovswl (%%esi), %%ecx\n"
		"\tjnz 2f\n"
		"\tsubl (%%ebx), %%ecx\n"
		"2:"
		"\tlock; addl %%ecx, (%%ebx)\n"
		/*
		 * do {
		 *	sample = *sum;
		 *	*dst = saturated(sample);
		 * } while (sample != *sum);
		 */
		"3:"
		"\tmovl (%%ebx), %%ecx\n"
		"\tcmpl $0x7fff,%%ecx\n"
		"\tjg 4f\n"
		"\tcmpl $-0x8000,%%ecx\n"
		"\tjl 5f\n"
		"\tmovw %%cx, (%%edi)\n"
		"\tcmpl %%ecx, (%%ebx)\n"
		"\tjnz 3b\n"
		/*
		 * advance the pointers; while (size-- > 0)
		 */
		"\tadd %4, %%edi\n"
		"\tadd %5, %%esi\n"
		"\tadd %6, %%ebx\n"
		"\tdecl %%edx\n"
		"\tjnz 1b\n"
		"\tjmp 6f\n"
		/*
		 * sample > 0x7fff
		 */
		"4:"
		"\tmovw $0x7fff, %%ax\n"
		"\tmovw %%ax, (%%edi)\n"
		"\tcmpl %%ecx,(%%ebx)\n"
		"\tjnz 3b\n"
		"\tadd %4, %%edi\n"
		"\tadd %5, %%esi\n"
		"\tadd %6, %%ebx\n"
		"\tdecl %%edx\n"
		"\tjnz 1b\n"
		"\tjmp 6f\n"
		/*
		 * sample < -0x8000
		 */
		"5:"
		"\tmovw $-0x8000, %%ax\n"
		"\tmovw %%ax, (%%edi)\n"
		"\tcmpl %%ecx, (%%ebx)\n"
		"\tjnz 3b\n"
		"\tadd %4, %%edi\n"
		"\tadd %5, %%esi\n"
		"\tadd %6, %%ebx\n"
		"\tdecl %%edx\n"
		"\tjnz 1b\n"
		"6:"
		: /* no output regs */
		: "m" (size), "m" (dst), "m" (src), "m" (sum),
		  "m" (dst_step), "m" (src_step), "m" (sum_step)
		/*
		 * The asm stores through dst and sum, which are not listed
		 * as operands, and changes the flags: declare both so the
		 * compiler does not keep stale copies of that memory.
		 */
		: "esi", "edi", "edx", "ecx", "ebx", "eax", "memory", "cc"
	);
}
/*
 * Server-less mixer written in C.
 *
 * size     - number of samples to mix
 * dst      - 16-bit destination samples
 * src      - 16-bit source samples
 * sum      - 32-bit accumulator buffer; advanced by one element per sample
 * dst_step - destination stride, in s16 elements
 * src_step - source stride, in s16 elements
 *
 * Per sample: if the destination still held 0 (cmpxchg swaps it to 1), the
 * current content of *sum is subtracted from the contribution before it is
 * added, so the accumulator ends up holding just this sample. The clipped
 * sum is then stored to *dst, and the store is repeated for as long as *sum
 * differs from the value that was clipped.
 */
void mix_areas3(unsigned int size,
		volatile s16 *dst, const s16 *src,
		volatile s32 *sum,
		unsigned int dst_step, unsigned int src_step)
{
	unsigned int left;

	for (left = size; left > 0;
	     left--, sum++, dst += dst_step, src += src_step) {
		s32 contribution = *src;
		s32 seen;

		if (cmpxchg(dst, 0, 1) == 0)
			contribution -= *sum;
		atomic_add(sum, contribution);

		for (;;) {
			seen = *sum;
			if (unlikely(seen < -0x8000))
				*dst = -0x8000;
			else if (unlikely(seen > 0x7fff))
				*dst = 0x7fff;
			else
				*dst = seen;
			/* retry when the sum changed after it was sampled */
			if (unlikely(seen != *sum))
				continue;
			break;
		}
	}
}
/*
 * Count the positions at which two sample buffers differ.
 *
 * b1, b2 - buffers to compare
 * size   - number of samples in each buffer
 *
 * Returns the number of mismatching samples.
 */
int compare(const s16* b1, const s16 *b2, unsigned int size)
{
	unsigned int mismatches = 0;
	unsigned int i;

	for (i = 0; i < size; i++)
		mismatches += (b1[i] != b2[i]);
	return mismatches;
}
/*
 * Share of one CPU, in percent, needed to mix a single 44100 Hz stereo
 * stream, given that `cycles` TSC ticks were spent on size * n samples and
 * the TSC runs at cpu_clock ticks per second.
 */
static double cpu_percent(unsigned long long cycles, int size, int n,
			  double cpu_clock)
{
	/* size * n is formed in double so large arguments cannot overflow int */
	return 100 * 2 * 44100.0 * cycles / ((double)size * n * cpu_clock);
}

/*
 * Benchmark driver.
 *
 * Usage: prog size n max
 *   size - samples per stream
 *   n    - number of streams to mix
 *   max  - amplitude bound; samples are drawn from [-max, max - 1]
 *
 * Generates n random streams, mixes them with each of the four mixers and
 * prints, per mixer, the TSC ticks spent, the derived CPU percentage and
 * (for mix_areas1..3) the number of samples that differ from the reference
 * produced by mix_areas0 + saturate.
 *
 * Returns 0 on success, 1 on bad arguments or allocation failure.
 */
int main(int argc, char **argv)
{
	int size, n, max;
	int i;
	int ret = 1;
	unsigned long long begin, end;
	s16 *dst = NULL;
	s16 *check = NULL;
	s32 *sum = NULL;
	s16 **srcs = NULL;
	double cpu_clock;

	if (argc < 4) {
		fprintf(stderr, "Usage: %s size n max\n",
			argc > 0 ? argv[0] : "sum");
		return 1;
	}
	size = atoi(argv[1]);
	n = atoi(argv[2]);
	max = atoi(argv[3]);
	/*
	 * size and n are divisors in the CPU percentage and max * 2 is the
	 * modulus for rand(); max above 0x8000 would not fit in an s16.
	 */
	if (size <= 0 || n <= 0 || max <= 0 || max > 0x8000) {
		fprintf(stderr,
			"%s: size and n must be > 0, max must be in 1..32768\n",
			argv[0]);
		return 1;
	}

	dst = malloc(sizeof(*dst) * size);
	check = malloc(sizeof(*check) * size);
	sum = malloc(sizeof(*sum) * size);
	/* zeroed so the cleanup path can free a partially filled table */
	srcs = calloc(n, sizeof(*srcs));
	if (!dst || !check || !sum || !srcs) {
		fprintf(stderr, "%s: out of memory\n", argv[0]);
		goto out;
	}

	cpu_clock = detect_cpu_clock();
	printf("CPU clock: %f\n", cpu_clock);

	for (i = 0; i < n; i++) {
		int k;
		s16 *s = malloc(sizeof(*s) * size);

		if (!s) {
			fprintf(stderr, "%s: out of memory\n", argv[0]);
			goto out;
		}
		srcs[i] = s;
		for (k = 0; k < size; ++k, ++s) {
			*s = (rand() % (max * 2)) - max;
		}
	}

	/* Reference result: 32-bit accumulation followed by one saturation */
	memset(sum, 0, sizeof(*sum) * size);
	rdtscll(begin);
	for (i = 0; i < n; i++) {
		mix_areas0(size, srcs[i], sum, 1);
	}
	saturate(size, check, sum, 1);
	rdtscll(end);
	printf("mix_areas0: %llu %f%%\n", end - begin,
	       cpu_percent(end - begin, size, n, cpu_clock));

	memset(dst, 0, sizeof(*dst) * size);
	rdtscll(begin);
	for (i = 0; i < n; i++) {
		mix_areas1(size, dst, srcs[i], 1, 1);
	}
	rdtscll(end);
	printf("mix_areas1: %llu %f%% (%d)\n", end - begin,
	       cpu_percent(end - begin, size, n, cpu_clock),
	       compare(dst, check, size));

	memset(sum, 0, sizeof(*sum) * size);
	rdtscll(begin);
	for (i = 0; i < n; i++) {
		/*
		 * mix_areas2 adds its step arguments directly to the
		 * pointers, so it needs byte strides rather than element
		 * counts.
		 */
		mix_areas2(size, dst, srcs[i], sum,
			   sizeof(*dst), sizeof(*srcs[i]), sizeof(*sum));
	}
	rdtscll(end);
	printf("mix_areas2: %llu %f%% (%d)\n", end - begin,
	       cpu_percent(end - begin, size, n, cpu_clock),
	       compare(dst, check, size));

	memset(sum, 0, sizeof(*sum) * size);
	rdtscll(begin);
	for (i = 0; i < n; i++) {
		mix_areas3(size, dst, srcs[i], sum, 1, 1);
	}
	rdtscll(end);
	printf("mix_areas3: %llu %f%% (%d)\n", end - begin,
	       cpu_percent(end - begin, size, n, cpu_clock),
	       compare(dst, check, size));

	ret = 0;
out:
	if (srcs) {
		for (i = 0; i < n; i++)
			free(srcs[i]);
	}
	free(srcs);
	free(sum);
	free(check);
	free(dst);
	return ret;
}