You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

267 lines
8.3 KiB

@*****************************************************************************
@ simple_channel_mixer.S : ARM NEON channel mixer
@*****************************************************************************
@ Copyright (C) 2012 David Geldreich <david.geldreich@free.fr>
@ Sébastien Toque
@
@ This program is free software; you can redistribute it and/or modify it
@ under the terms of the GNU Lesser General Public License as published by
@ the Free Software Foundation; either version 2.1 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public License
@ along with this program; if not, write to the Free Software Foundation,
@ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
@*****************************************************************************
#include "asm.S"
@ Select the NEON register file only when the build reports that the
@ assembler accepts the .fpu directive.
#if HAVE_AS_FPU_DIRECTIVE
.fpu neon
#endif
.text
@ With GNU as on ARM the operand is a power of two: align to 4 bytes.
.align 2
@ Register roles shared by every routine below.
@   DST   : output sample pointer, advanced by each store
@   SRC   : input sample pointer, advanced as each frame is read
@   NUM   : frames left to convert, counted down to zero by each loop
@   LFE   : tested against zero to decide whether one extra input float
@           per frame has to be skipped
@   COEFF : scratch pointer to the coefficient table of the routine,
@           saved and restored by each routine with push and pop
@ The names are preprocessor macros, so the comments are kept on separate
@ lines: text placed after a definition would become part of its expansion.
#define DST r0
#define SRC r1
#define NUM r2
#define LFE r3
#define COEFF r4
@ ---------------------------------------------------------------------------
@ convert_7_x_to_2_0_neon_asm
@ Downmix NUM interleaved float frames from 7 channels to 2 channels.
@   dst[0] = 0.5*src[0] + 0.25*src[2] + 0.25*src[4] + src[6]
@   dst[1] = 0.5*src[1] + 0.25*src[3] + 0.25*src[5] + src[6]
@ In:    DST = output pointer, 2 floats are stored per frame
@        SRC = input pointer, 7 floats are read per frame
@        NUM = number of frames, zero converts nothing
@        LFE = non-zero when every input frame is followed by one extra
@              float (the lfe channel) that is skipped and not mixed
@ Clobb: DST, SRC, NUM, LFE, q0, q2, q3, flags; COEFF is saved and restored
@ ---------------------------------------------------------------------------
        .align  2                       @ word-align the table whatever precedes it
coeff_7to2:
        .float  0.5
        .float  0.5
        .float  0.25
        .float  0.25

function convert_7_x_to_2_0_neon_asm
        push    {r4,lr}
        teq     NUM,#0
        beq     1f                      @ no frame: the do-while loop would wrap NUM
        adr     COEFF, coeff_7to2
        vld1.32 {q0},[COEFF]            @ q0 = 0.5 0.5 0.25 0.25
        teq     LFE,#0                  @ loop invariant, resolve the lfe skip once
        ite     eq
        moveq   LFE,#4                  @ LFE = bytes from src[6] to the next frame
        movne   LFE,#8                  @ one float more to skip the lfe channel
0:      @ one frame per iteration
        vld1.32 {q2},[SRC]!             @ load 0,1,2,3
        vmul.f32 q2,q2,q0               @ 0.5*src[0] 0.5*src[1] 0.25*src[2] 0.25*src[3]
        vld1.32 {d6},[SRC]!             @ load 4,5
        vmul.f32 d6,d6,d1               @ 0.25*src[4] 0.25*src[5]
        vadd.f32 d4,d4,d5               @ 0.5*src[0] + 0.25*src[2]
                                        @ 0.5*src[1] + 0.25*src[3]
        vadd.f32 d4,d4,d6               @ 0.5*src[0] + 0.25*src[2] + 0.25*src[4]
                                        @ 0.5*src[1] + 0.25*src[3] + 0.25*src[5]
        flds    s14,[SRC]               @ load 6 into the low lane of d7
        vdup.32 d7,d7[0]                @ src[6] src[6]
        add     SRC,SRC,LFE             @ step to the next frame
        vadd.f32 d4,d4,d7               @ 0.5*src[0] + 0.25*src[2] + 0.25*src[4] + src[6]
                                        @ 0.5*src[1] + 0.25*src[3] + 0.25*src[5] + src[6]
        vst1.32 d4, [DST]!
        subs    NUM,NUM,#1
        bne     0b
1:
        pop     {r4,pc}
@ ---------------------------------------------------------------------------
@ convert_5_x_to_2_0_neon_asm
@ Downmix NUM interleaved float frames from 5 channels to 2 channels.
@   dst[0] = 0.5*src[0] + 0.33*src[2] + src[4]
@   dst[1] = 0.5*src[1] + 0.33*src[3] + src[4]
@ In:    DST = output pointer, 2 floats are stored per frame
@        SRC = input pointer, 5 floats are read per frame
@        NUM = number of frames, zero converts nothing
@        LFE = non-zero when every input frame is followed by one extra
@              float (the lfe channel) that is skipped and not mixed
@ Clobb: DST, SRC, NUM, LFE, q0, q1, d4, flags; COEFF is saved and restored
@ ---------------------------------------------------------------------------
        .align  2                       @ the code above may end on a halfword boundary
coeff_5to2:
        .float  0.5
        .float  0.5
        .float  0.33
        .float  0.33

function convert_5_x_to_2_0_neon_asm
        push    {r4,lr}
        teq     NUM,#0
        beq     1f                      @ no frame: the do-while loop would wrap NUM
        adr     COEFF, coeff_5to2
        vld1.32 {q0},[COEFF]            @ load constants
        teq     LFE,#0                  @ loop invariant, resolve the lfe skip once
        ite     eq
        moveq   LFE,#4                  @ LFE = bytes from src[4] to the next frame
        movne   LFE,#8                  @ one float more to skip the lfe channel
0:      @ one frame per iteration
        vld1.32 {q1},[SRC]!             @ load 0,1,2,3
        flds    s8,[SRC]                @ load 4 into the low lane of d4
        vdup.32 d4,d4[0]                @ src[4] src[4]
        add     SRC,SRC,LFE             @ step to the next frame
        vmul.f32 q1,q1,q0               @ 0.5*src[0] 0.5*src[1] 0.33*src[2] 0.33*src[3]
        vadd.f32 d2,d2,d3               @ 0.5*src[0] + 0.33*src[2]
                                        @ 0.5*src[1] + 0.33*src[3]
        vadd.f32 d2,d2,d4               @ 0.5*src[0] + 0.33*src[2] + src[4]
                                        @ 0.5*src[1] + 0.33*src[3] + src[4]
        vst1.32 d2,[DST]!
        subs    NUM,NUM,#1
        bne     0b
1:
        pop     {r4,pc}
@ ---------------------------------------------------------------------------
@ convert_4_0_to_2_0_neon_asm
@ Downmix NUM interleaved float frames from 4 channels to 2 channels.
@   dst[0] = 0.5*src[0] + src[3] + src[2]
@   dst[1] = 0.5*src[1] + src[3] + src[2]
@ In:    DST = output pointer, 2 floats are stored per frame
@        SRC = input pointer, 4 floats are read per frame
@        NUM = number of frames, zero converts nothing
@        LFE is not read by this routine
@ Clobb: DST, SRC, NUM, d0, q1, d4, flags; COEFF is saved and restored
@ ---------------------------------------------------------------------------
        .align  2                       @ the code above may end on a halfword boundary
coeff_4to2:
        .float  0.5
        .float  0.5

function convert_4_0_to_2_0_neon_asm
        push    {r4,lr}
        teq     NUM,#0
        beq     1f                      @ no frame: the do-while loop would wrap NUM
        adr     COEFF, coeff_4to2
        vld1.32 {d0},[COEFF]            @ load constants
0:      @ one frame per iteration
        vld1.32 {q1},[SRC]!             @ load 0,1,2,3
        vmul.f32 d2,d2,d0               @ 0.5*src[0] 0.5*src[1]
        vdup.32 d4,d3[0]                @ dup src[2], taken before d3 is overwritten
        vdup.32 d3,d3[1]                @ dup src[3]
        vadd.f32 d2,d2,d3               @ +src[3]
        vadd.f32 d2,d2,d4               @ +src[2]
        vst1.32 d2,[DST]!
        subs    NUM,NUM,#1
        bne     0b
1:
        pop     {r4,pc}
@ ---------------------------------------------------------------------------
@ convert_3_x_to_2_0_neon_asm
@ Downmix NUM interleaved float frames from 3 channels to 2 channels.
@   dst[0] = 0.5*src[0] + src[2]
@   dst[1] = 0.5*src[1] + src[2]
@ In:    DST = output pointer, 2 floats are stored per frame
@        SRC = input pointer, 3 floats are read per frame
@        NUM = number of frames, zero converts nothing
@        LFE = non-zero when every input frame is followed by one extra
@              float (the lfe channel) that is skipped and not mixed
@ Clobb: DST, SRC, NUM, LFE, d0, d1, d2, flags; COEFF is saved and restored
@ ---------------------------------------------------------------------------
        .align  2                       @ the code above may end on a halfword boundary
coeff_3to2:
        .float  0.5
        .float  0.5

function convert_3_x_to_2_0_neon_asm
        push    {r4,lr}
        teq     NUM,#0
        beq     1f                      @ no frame: the do-while loop would wrap NUM
        adr     COEFF, coeff_3to2
        vld1.32 {d0},[COEFF]            @ load constants
        teq     LFE,#0                  @ loop invariant, resolve the lfe skip once
        ite     eq
        moveq   LFE,#4                  @ LFE = bytes from src[2] to the next frame
        movne   LFE,#8                  @ one float more to skip the lfe channel
0:      @ one frame per iteration
        vld1.32 {d1},[SRC]!             @ load 0,1
        flds    s4,[SRC]                @ load 2 into the low lane of d2
        vdup.32 d2,d2[0]                @ src[2] src[2]
        add     SRC,SRC,LFE             @ step to the next frame
        vmul.f32 d1,d1,d0               @ 0.5*src[0] 0.5*src[1]
        vadd.f32 d1,d1,d2               @ 0.5*src[0] + src[2]
                                        @ 0.5*src[1] + src[2]
        vst1.32 d1,[DST]!
        subs    NUM,NUM,#1
        bne     0b
1:
        pop     {r4,pc}
@ ---------------------------------------------------------------------------
@ convert_7_x_to_1_0_neon_asm
@ Downmix NUM interleaved float frames from 7 channels to 1 channel.
@   dst[0] = 0.25*src[0] + 0.25*src[1]
@          + 0.125*src[2] + 0.125*src[3] + 0.125*src[4] + 0.125*src[5]
@          + src[6]
@ In:    DST = output pointer, 1 float is stored per frame
@        SRC = input pointer, 7 floats are read per frame
@        NUM = number of frames, zero converts nothing
@        LFE = non-zero when every input frame is followed by one extra
@              float (the lfe channel) that is skipped and not mixed
@ Clobb: DST, SRC, NUM, LFE, q0, q1, d4, s10, flags; COEFF is saved and
@        restored
@ ---------------------------------------------------------------------------
        .align  2                       @ the code above may end on a halfword boundary
coeff_7to1:
        .float  0.25
        .float  0.25
        .float  0.125
        .float  0.125

function convert_7_x_to_1_0_neon_asm
        push    {r4,lr}
        teq     NUM,#0
        beq     1f                      @ no frame: the do-while loop would wrap NUM
        adr     COEFF, coeff_7to1
        vld1.32 {q0},[COEFF]            @ q0 = 0.25 0.25 0.125 0.125
        teq     LFE,#0                  @ loop invariant, resolve the lfe skip once
        ite     eq
        moveq   LFE,#4                  @ LFE = bytes from src[6] to the next frame
        movne   LFE,#8                  @ one float more to skip the lfe channel
0:      @ one frame per iteration
        vld1.32 {q1},[SRC]!             @ load 0,1,2,3
        vmul.f32 q1,q1,q0               @ 0.25*src[0] 0.25*src[1] 0.125*src[2] 0.125*src[3]
        vld1.32 {d4},[SRC]!             @ load 4,5
        vmul.f32 d4,d4,d1               @ 0.125*src[4] 0.125*src[5]
        vadd.f32 d2,d2,d3               @ even channels in lane 0, odd channels in lane 1
        vadd.f32 d2,d2,d4
        flds    s10,[SRC]               @ load 6
        add     SRC,SRC,LFE             @ step to the next frame
        vadd.f32 s4,s4,s5               @ fold the two lanes of d2 into one sample
        vadd.f32 s4,s4,s10              @ + src[6]
        fsts    s4,[DST]
        add     DST,DST,#4
        subs    NUM,NUM,#1
        bne     0b
1:
        pop     {r4,pc}
@ ---------------------------------------------------------------------------
@ convert_5_x_to_1_0_neon_asm
@ Downmix NUM interleaved float frames from 5 channels to 1 channel.
@   dst[0] = 0.25*src[0] + 0.25*src[1]
@          + 0.16666667*src[2] + 0.16666667*src[3]
@          + src[4]
@ In:    DST = output pointer, 1 float is stored per frame
@        SRC = input pointer, 5 floats are read per frame
@        NUM = number of frames, zero converts nothing
@        LFE = non-zero when every input frame is followed by one extra
@              float (the lfe channel) that is skipped and not mixed
@ Clobb: DST, SRC, NUM, LFE, q0, q1, s10, flags; COEFF is saved and restored
@ ---------------------------------------------------------------------------
        .align  2                       @ the code above may end on a halfword boundary
coeff_5to1:
        .float  0.25
        .float  0.25
        .float  0.16666667
        .float  0.16666667

function convert_5_x_to_1_0_neon_asm
        push    {r4,lr}
        teq     NUM,#0
        beq     1f                      @ no frame: the do-while loop would wrap NUM
        adr     COEFF, coeff_5to1
        vld1.32 {q0},[COEFF]            @ q0 = 0.25 0.25 1/6 1/6
        teq     LFE,#0                  @ loop invariant, resolve the lfe skip once
        ite     eq
        moveq   LFE,#4                  @ LFE = bytes from src[4] to the next frame
        movne   LFE,#8                  @ one float more to skip the lfe channel
0:      @ one frame per iteration
        vld1.32 {q1},[SRC]!             @ load 0,1,2,3
        vmul.f32 q1,q1,q0               @ 0.25*src[0] 0.25*src[1] src[2]/6 src[3]/6
        vadd.f32 d2,d2,d3               @ even channels in lane 0, odd channels in lane 1
        flds    s10,[SRC]               @ load 4
        add     SRC,SRC,LFE             @ step to the next frame
        vadd.f32 s4,s4,s5               @ fold the two lanes of d2 into one sample
        vadd.f32 s4,s4,s10              @ + src[4]
        fsts    s4,[DST]
        add     DST,DST,#4
        subs    NUM,NUM,#1
        bne     0b
1:
        pop     {r4,pc}
@ ---------------------------------------------------------------------------
@ convert_7_x_to_4_0_neon_asm
@ Downmix NUM interleaved float frames from 7 channels to 4 channels.
@   dst[0] = src[6] + 0.5*src[0] + 0.16666667*src[2]
@   dst[1] = src[6] + 0.5*src[1] + 0.16666667*src[3]
@   dst[2] = src[4] + 0.16666667*src[2]
@   dst[3] = src[5] + 0.16666667*src[3]
@ In:    DST = output pointer, 4 floats are stored per frame
@        SRC = input pointer, 7 floats are read per frame
@        NUM = number of frames, zero converts nothing
@        LFE = non-zero when every input frame is followed by one extra
@              float (the lfe channel) that is skipped and not mixed
@ Clobb: DST, SRC, NUM, LFE, q0, q1, q2, s14, flags; COEFF is saved and
@        restored
@ ---------------------------------------------------------------------------
        .align  2                       @ the code above may end on a halfword boundary
coeff_7to4:
        .float  0.5
        .float  0.5
        .float  0.16666667
        .float  0.16666667

function convert_7_x_to_4_0_neon_asm
        push    {r4,lr}
        teq     NUM,#0
        beq     1f                      @ no frame: the do-while loop would wrap NUM
        adr     COEFF, coeff_7to4
        vld1.32 {q0},[COEFF]            @ q0 = 0.5 0.5 1/6 1/6
        teq     LFE,#0                  @ loop invariant, resolve the lfe skip once
        ite     eq
        moveq   LFE,#4                  @ LFE = bytes from src[6] to the next frame
        movne   LFE,#8                  @ one float more to skip the lfe channel
0:      @ one frame per iteration
        vld1.32 {q1},[SRC]!             @ load 0,1,2,3
        vmul.f32 q1,q1,q0               @ 0.5*src[0] 0.5*src[1] src[2]/6 src[3]/6
        vld1.32 {d5},[SRC]!             @ load 4,5 straight into the high half of q2
        flds    s14,[SRC]               @ load 6 into the low lane of d7
        add     SRC,SRC,LFE             @ step to the next frame
        vadd.f32 d2,d2,d3               @ 0.5*src[0] + src[2]/6
                                        @ 0.5*src[1] + src[3]/6
        vdup.32 d4,d7[0]                @ so q2 : src[6] src[6] src[4] src[5]
        vadd.f32 q2,q2,q1               @ src[6] + 0.5*src[0] + src[2]/6
                                        @ src[6] + 0.5*src[1] + src[3]/6
                                        @ src[4] + src[2]/6
                                        @ src[5] + src[3]/6
        vst1.32 {q2}, [DST]!
        subs    NUM,NUM,#1
        bne     0b
1:
        pop     {r4,pc}
@ ---------------------------------------------------------------------------
@ convert_5_x_to_4_0_neon_asm
@ Downmix NUM interleaved float frames from 5 channels to 4 channels.
@   dst[0] = 0.5*src[0] + src[4]
@   dst[1] = 0.5*src[1] + src[4]
@   dst[2] = src[2]
@   dst[3] = src[3]
@ In:    DST = output pointer, 4 floats are stored per frame
@        SRC = input pointer, 5 floats are read per frame
@        NUM = number of frames, zero converts nothing
@        LFE = non-zero when every input frame is followed by one extra
@              float (the lfe channel) that is skipped and not mixed
@ Clobb: DST, SRC, NUM, LFE, d0, q1, d4, flags; COEFF is saved and restored
@ ---------------------------------------------------------------------------
        .align  2                       @ the code above may end on a halfword boundary
coeff_5to4:
        .float  0.5
        .float  0.5

function convert_5_x_to_4_0_neon_asm
        push    {r4,lr}
        teq     NUM,#0
        beq     1f                      @ no frame: the do-while loop would wrap NUM
        adr     COEFF, coeff_5to4
        vld1.32 {d0},[COEFF]            @ d0 = 0.5 0.5
        teq     LFE,#0                  @ loop invariant, resolve the lfe skip once
        ite     eq
        moveq   LFE,#4                  @ LFE = bytes from src[4] to the next frame
        movne   LFE,#8                  @ one float more to skip the lfe channel
0:      @ one frame per iteration
        vld1.32 {q1},[SRC]!             @ load 0,1,2,3
        vmul.f32 d2,d2,d0               @ 0.5*src[0] 0.5*src[1]
        flds    s8,[SRC]                @ load 4 into the low lane of d4
        add     SRC,SRC,LFE             @ step to the next frame
        vdup.32 d4,d4[0]                @ src[4] src[4]
        vadd.f32 d2,d2,d4               @ 0.5*src[0] + src[4]
                                        @ 0.5*src[1] + src[4]
                                        @ src[2]   (d3 is stored untouched)
                                        @ src[3]
        vst1.32 {q1}, [DST]!
        subs    NUM,NUM,#1
        bne     0b
1:
        pop     {r4,pc}