364 lines
13 KiB
ArmAsm
Executable File
364 lines
13 KiB
ArmAsm
Executable File
/***************************************************************************
|
|
* __________ __ ___.
|
|
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
|
|
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
|
|
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
|
|
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
|
|
* \/ \/ \/ \/ \/
|
|
* $Id$
|
|
*
|
|
* Copyright (C) 2008 by Andree Buschmann
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version 2
|
|
* of the License, or (at your option) any later version.
|
|
*
|
|
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
|
|
* KIND, either express or implied.
|
|
*
|
|
****************************************************************************/
|
|
|
|
#include "mpc_config.h"
|
|
|
|
.section .text, "ax", %progbits
|
|
|
|
#if defined(OPTIMIZE_FOR_SPEED)
|
|
/****************************************************************************
|
|
* void mpc_decoder_windowing_D(...)
|
|
*
|
|
* 2nd step within synthesis filter. Does the dewindowing.
|
|
* 32=32x32 multiplies (OPTIMIZE_FOR_SPEED)
|
|
* Uses pre-shifted V[] and D[] values. D[] will always be the second operand
|
|
* of mul/mla to achieve higher speed as D[] has lower amplitude than V[].
|
|
****************************************************************************/
|
|
.align 2
|
|
.global mpc_decoder_windowing_D
|
|
.type mpc_decoder_windowing_D, %function
|
|
/* One dewindowing tap of the 32-bit path:
 *   r12 += V[vofs] * dreg
 * r1 is the current V[] column, r11 is scratch. Emits exactly one ldr
 * followed by one mla. */
.macro dewin_mla vofs, dreg
    ldr     r11, [r1, #\vofs*4]
    mla     r12, r11, \dreg, r12
.endm

mpc_decoder_windowing_D:
    /* In:   r0 = Data[]  (32 words are written, one per pass)
     *       r1 = V[]     (advanced by one word per pass)
     *       r2 = D[]     (16 words consumed per pass)
     * Work: r3-r10 = eight D[] coefficients, r11 = V[] sample,
     *       r12 = 32-bit accumulator, lr = remaining passes */
    stmfd   sp!, {r4-r12, lr}

    mov     lr, #32                 /* 32 output samples */

.sample32:
    ldmia   r2!, {r3-r10}           /* D[00..07] */
    ldr     r11, [r1]               /* tap 0 starts the sum */
    mul     r12, r11, r3
    dewin_mla   96, r4              /*  1 */
    dewin_mla  128, r5              /*  2 */
    dewin_mla  224, r6              /*  3 */
    dewin_mla  256, r7              /*  4 */
    dewin_mla  352, r8              /*  5 */
    dewin_mla  384, r9              /*  6 */
    dewin_mla  480, r10             /*  7 */

    ldmia   r2!, {r3-r10}           /* D[08..15] */
    dewin_mla  512, r3              /*  8 */
    dewin_mla  608, r4              /*  9 */
    dewin_mla  640, r5              /* 10 */
    dewin_mla  736, r6              /* 11 */
    dewin_mla  768, r7              /* 12 */
    dewin_mla  864, r8              /* 13 */
    dewin_mla  896, r9              /* 14 */
    dewin_mla  992, r10             /* 15 */

    mov     r12, r12, asr #1        /* post shift, compensates the pre-shifting */
    str     r12, [r0], #4           /* Data[i] = sum, then step to Data[i+1] */
    add     r1, r1, #4              /* next V[] column */

    subs    lr, lr, #1
    bgt     .sample32

    ldmfd   sp!, {r4-r12, pc}       /* restore registers and return */
|
|
.mpc_dewindowing_end:
|
|
.size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D
|
|
#else
|
|
/****************************************************************************
|
|
* void mpc_decoder_windowing_D(...)
|
|
*
|
|
* 2nd step within synthesis filter. Does the dewindowing.
|
|
* 64=32x32 multiplies
|
|
* Uses un-shifted D[]-values. D[] will always be the second operand of
|
|
* smull/smlal to achieve higher speed as D[] has lower amplitude than V[].
|
|
****************************************************************************/
|
|
.align 2
|
|
.global mpc_decoder_windowing_D
|
|
.type mpc_decoder_windowing_D, %function
|
|
#if 0
|
|
/* One dewindowing tap of the 64-bit reference path:
 *   r9:r8 += V[vofs] * dreg   (signed 32x32 -> 64)
 * r1 is the current V[] column, r7 is scratch. Emits exactly one ldr
 * followed by one smlal. */
.macro dewin_ref_smlal vofs, dreg
    ldr     r7, [r1, #\vofs*4]
    smlal   r8, r9, r7, \dreg
.endm

mpc_decoder_windowing_D:
    /************************************************************************
     * Reference implementation.
     *
     * In:   r0 = Data[]  (32 words are written, one per pass)
     *       r1 = V[]     (advanced by one word per pass)
     *       r2 = D[]     (16 words consumed per pass, four at a time)
     * Work: r3-r6 = four D[] coefficients, r7 = V[] sample,
     *       r8 = low word / r9 = high word of the 64-bit sum,
     *       lr = remaining passes
     ***********************************************************************/
    stmfd   sp!, {r4-r9, lr}

    mov     lr, #32                 /* 32 output samples */

.ref_sample32:
    ldmia   r2!, {r3-r6}            /* D[00..03] */
    ldr     r7, [r1]                /* tap 0 starts the sum */
    smull   r8, r9, r7, r3
    dewin_ref_smlal   96, r4        /*  1 */
    dewin_ref_smlal  128, r5        /*  2 */
    dewin_ref_smlal  224, r6        /*  3 */

    ldmia   r2!, {r3-r6}            /* D[04..07] */
    dewin_ref_smlal  256, r3        /*  4 */
    dewin_ref_smlal  352, r4        /*  5 */
    dewin_ref_smlal  384, r5        /*  6 */
    dewin_ref_smlal  480, r6        /*  7 */

    ldmia   r2!, {r3-r6}            /* D[08..11] */
    dewin_ref_smlal  512, r3        /*  8 */
    dewin_ref_smlal  608, r4        /*  9 */
    dewin_ref_smlal  640, r5        /* 10 */
    dewin_ref_smlal  736, r6        /* 11 */

    ldmia   r2!, {r3-r6}            /* D[12..15] */
    dewin_ref_smlal  768, r3        /* 12 */
    dewin_ref_smlal  864, r4        /* 13 */
    dewin_ref_smlal  896, r5        /* 14 */
    dewin_ref_smlal  992, r6        /* 15 */

    /* keep bits 16..47 of the 64-bit sum: (lo >> 16) | (hi << 16) */
    mov     r8, r8, lsr #16
    orr     r8, r8, r9, lsl #16
    str     r8, [r0], #4            /* Data[i] = result, then step to Data[i+1] */
    add     r1, r1, #4              /* next V[] column */

    subs    lr, lr, #1
    bgt     .ref_sample32

    ldmfd   sp!, {r4-r9, pc}        /* restore registers and return */
|
|
#else
|
|
mpc_decoder_windowing_D:
    /* In:   r0 = Data[]  (32 output words are written)
     *       r1 = V[]
     *       r2 = D[]
     * Work: r3-r6 = four D[] coefficients, r7 = V[] sample,
     *       r8/r9 = lo/hi of the forward sum, r10/r11 = scratch or lo/hi of
     *       the mirrored sum, r12 = mirrored V[] column,
     *       lr = row counter (only inside the 15-row loop) */
    /************************************************************************
     * Further speed up through making use of symmetries within D[]-window.
     * The row V[00] can be extracted as it has symmetries within this single
     * row. 8 smull/smlal and 8 ldr's can be saved at the cost of 2 add's.
     * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
     * saved at the cost of 15 x 4 + 1 add's.
     * The row V[16] can be extracted as it has symmetries within this single
     * row. 8 smull/smlal and 8 ldr's can be saved.
     ***********************************************************************/
    stmfd   sp!, {r4-r12, lr}

    /******************************************
     * row 0 with internal symmetry
     * V[] samples are combined pairwise first, so only 8 multiplies with
     * D[01..08] are needed.
     *****************************************/
    add     r2, r2, #4              /* D+=1, r2 = D[01] as D[00] = 0 */
    ldmia   r2!, {r3-r6}            /* load D[01..04] */
    ldr     r7 , [r1, #96*4]        /*  1 */
    ldr     r10, [r1, #992*4]       /* 15 */
    rsb     r10, r10, r7            /* V[01] - V[15] */
    smull   r8, r9, r10, r3
    ldr     r7 , [r1, #128*4]       /*  2 */
    ldr     r10, [r1, #896*4]       /* 14 */
    add     r10, r10, r7            /* V[02] + V[14] */
    smlal   r8, r9, r10, r4
    ldr     r7 , [r1, #224*4]       /*  3 */
    ldr     r10, [r1, #864*4]       /* 13 */
    rsb     r10, r10, r7            /* V[03] - V[13] */
    smlal   r8, r9, r10, r5
    ldr     r7 , [r1, #256*4]       /*  4 */
    ldr     r10, [r1, #768*4]       /* 12 */
    add     r10, r10, r7            /* V[04] + V[12] */
    smlal   r8, r9, r10, r6
    ldmia   r2!, {r3-r6}            /* load D[05..08] */
    ldr     r7 , [r1, #352*4]       /*  5 */
    ldr     r10, [r1, #736*4]       /* 11 */
    rsb     r10, r10, r7            /* V[05] - V[11] */
    smlal   r8, r9, r10, r3
    ldr     r7 , [r1, #384*4]       /*  6 */
    ldr     r10, [r1, #640*4]       /* 10 */
    add     r10, r10, r7            /* V[06] + V[10] */
    smlal   r8, r9, r10, r4
    ldr     r7 , [r1, #480*4]       /*  7 */
    ldr     r10, [r1, #608*4]       /*  9 */
    rsb     r10, r10, r7            /* V[07] - V[09] */
    smlal   r8, r9, r10, r5
    ldr     r10, [r1, #512*4]       /*  8, has no partner */
    smlal   r8, r9, r10, r6
    mov     r8, r8, lsr #16
    orr     r8, r8, r9, lsl #16     /* (lo>>16) || (hi<<16) */
    str     r8, [r0], #4            /* store Data[00], r0 = Data[01] */
    add     r1, r1, #4              /* V+=1, r1 = V[01] */
    add     r2, r2, #7*4            /* D+=7, r2 = D[16] */

    /******************************************
     * rows 01..15 are symmetric to rows 31..17
     * r8  = lo, r9  = hi of 01..15
     * r1  = V[01..15]
     * r10 = lo, r11 = hi of 31..17
     * r12 = V[31..17]
     * Each group of four D[] values is loaded once and used for both
     * rows; the mirrored row takes them in reverse register order
     * (r6..r3). D[] indices in the comments below are relative to the
     * current row.
     *****************************************/
    mov     lr, #15
    add     r12, r1, #30*4          /* r12 = V[31] */
.loop15:
    ldmia   r2!, {r3-r6}            /* load D[00..03] */
    ldr     r7, [r12, #768*4]       /* 12 */
    smull   r10, r11, r7, r6
    ldr     r7, [r12, #864*4]       /* 13 */
    smlal   r10, r11, r7, r5
    ldr     r7, [r12, #896*4]       /* 14 */
    smlal   r10, r11, r7, r4
    ldr     r7, [r12, #992*4]       /* 15 */
    smlal   r10, r11, r7, r3
    ldr     r7, [r1]                /*  0 */
    smull   r8, r9, r7, r3
    ldr     r7, [r1, #96*4]         /*  1 */
    smlal   r8, r9, r7, r4
    ldr     r7, [r1, #128*4]        /*  2 */
    smlal   r8, r9, r7, r5
    ldr     r7, [r1, #224*4]        /*  3 */
    smlal   r8, r9, r7, r6
    ldmia   r2!, {r3-r6}            /* load D[04..07] */
    ldr     r7, [r1, #256*4]        /*  4 */
    smlal   r8, r9, r7, r3
    ldr     r7, [r1, #352*4]        /*  5 */
    smlal   r8, r9, r7, r4
    ldr     r7, [r1, #384*4]        /*  6 */
    smlal   r8, r9, r7, r5
    ldr     r7, [r1, #480*4]        /*  7 */
    smlal   r8, r9, r7, r6
    ldr     r7, [r12, #512*4]       /*  8 */
    smlal   r10, r11, r7, r6
    ldr     r7, [r12, #608*4]       /*  9 */
    smlal   r10, r11, r7, r5
    ldr     r7, [r12, #640*4]       /* 10 */
    smlal   r10, r11, r7, r4
    ldr     r7, [r12, #736*4]       /* 11 */
    smlal   r10, r11, r7, r3
    ldmia   r2!, {r3-r6}            /* load D[08..11] */
    ldr     r7, [r12, #256*4]       /*  4 */
    smlal   r10, r11, r7, r6
    ldr     r7, [r12, #352*4]       /*  5 */
    smlal   r10, r11, r7, r5
    ldr     r7, [r12, #384*4]       /*  6 */
    smlal   r10, r11, r7, r4
    ldr     r7, [r12, #480*4]       /*  7 */
    smlal   r10, r11, r7, r3
    ldr     r7, [r1, #512*4]        /*  8 */
    smlal   r8, r9, r7, r3
    ldr     r7, [r1, #608*4]        /*  9 */
    smlal   r8, r9, r7, r4
    ldr     r7, [r1, #640*4]        /* 10 */
    smlal   r8, r9, r7, r5
    ldr     r7, [r1, #736*4]        /* 11 */
    smlal   r8, r9, r7, r6
    ldmia   r2!, {r3-r6}            /* load D[12..15] */
    ldr     r7, [r1, #768*4]        /* 12 */
    smlal   r8, r9, r7, r3
    ldr     r7, [r1, #864*4]        /* 13 */
    smlal   r8, r9, r7, r4
    ldr     r7, [r1, #896*4]        /* 14 */
    smlal   r8, r9, r7, r5
    ldr     r7, [r1, #992*4]        /* 15 */
    smlal   r8, r9, r7, r6
    ldr     r7, [r12]               /*  0 */
    smlal   r10, r11, r7, r6
    ldr     r7, [r12, #96*4]        /*  1 */
    smlal   r10, r11, r7, r5
    ldr     r7, [r12, #128*4]       /*  2 */
    smlal   r10, r11, r7, r4
    ldr     r7, [r12, #224*4]       /*  3 */
    smlal   r10, r11, r7, r3
    /* store Data[01..15] */
    mov     r8, r8, lsr #16
    orr     r8, r8, r9, lsl #16     /* (lo>>16) || (hi<<16) */
    str     r8, [r0]                /* store Data, r0 is not advanced yet */
    /* store Data[31..17] */
    add     r0, r0, lr, asl #3      /* r0 = r0 + 2*lr [words], the mirrored slot */
    mov     r10, r10, lsr #16
    orr     r10, r10, r11, lsl #16  /* (lo>>16) || (hi<<16) */
    rsb     r10, r10, #0            /* r10 = -r10, mirrored row is stored negated */
    str     r10, [r0], #4           /* store Data; the +4 becomes the forward step */
    sub     r0, r0, lr, asl #3      /* r0 = r0 - 2*lr [words], next forward slot */
    /* correct addresses for next loop */
    sub     r12, r12, #4            /* r12 = V-- */
    add     r1, r1, #4              /* r1 = V++ */
    /* next loop */
    subs    lr, lr, #1
    bgt     .loop15

    /******************************************
     * V[16] with internal symmetry
     * Here r0 = Data[16] and r1 = V[16]; the sample pairs are subtracted
     * first, so only 8 multiplies with this row's D[00..07] are needed.
     *****************************************/
    ldmia   r2!, {r3-r6}            /* load D[00..03] */
    ldr     r7 , [r1]               /*  0 */
    ldr     r10, [r1, #992*4]       /* 15 */
    rsb     r10, r10, r7            /* V[00] - V[15] */
    smull   r8, r9, r10, r3
    ldr     r7 , [r1, #96*4]        /*  1 */
    ldr     r10, [r1, #896*4]       /* 14 */
    rsb     r10, r10, r7            /* V[01] - V[14] */
    smlal   r8, r9, r10, r4
    ldr     r7 , [r1, #128*4]       /*  2 */
    ldr     r10, [r1, #864*4]       /* 13 */
    rsb     r10, r10, r7            /* V[02] - V[13] */
    smlal   r8, r9, r10, r5
    ldr     r7 , [r1, #224*4]       /*  3 */
    ldr     r10, [r1, #768*4]       /* 12 */
    rsb     r10, r10, r7            /* V[03] - V[12] */
    smlal   r8, r9, r10, r6
    ldmia   r2!, {r3-r6}            /* load D[04..07] */
    ldr     r7 , [r1, #256*4]       /*  4 */
    ldr     r10, [r1, #736*4]       /* 11 */
    rsb     r10, r10, r7            /* V[04] - V[11] */
    smlal   r8, r9, r10, r3
    ldr     r7 , [r1, #352*4]       /*  5 */
    ldr     r10, [r1, #640*4]       /* 10 */
    rsb     r10, r10, r7            /* V[05] - V[10] */
    smlal   r8, r9, r10, r4
    ldr     r7 , [r1, #384*4]       /*  6 */
    ldr     r10, [r1, #608*4]       /*  9 */
    rsb     r10, r10, r7            /* V[06] - V[09] */
    smlal   r8, r9, r10, r5
    ldr     r7 , [r1, #480*4]       /*  7 */
    ldr     r10, [r1, #512*4]       /*  8 */
    rsb     r10, r10, r7            /* V[07] - V[08] */
    smlal   r8, r9, r10, r6
    mov     r8, r8, lsr #16
    orr     r8, r8, r9, lsl #16     /* (lo>>16) || (hi<<16) */
    str     r8, [r0], #4            /* store Data[16] */
    /* V[] is not advanced here: r1 is not read again before the return */

    ldmfd   sp!, {r4-r12, pc}       /* restore registers and return */
|
|
#endif
|
|
.mpc_dewindowing_end:
|
|
.size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D
|
|
#endif
|