diff options
author | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
---|---|---|
committer | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
commit | 5f8de423f190bbb79a62f804151bc24824fa32d8 (patch) | |
tree | 10027f336435511475e392454359edea8e25895d /media/libtheora/lib/arm | |
parent | 49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff) | |
download | UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip |
Add m-esr52 at 52.6.0
Diffstat (limited to 'media/libtheora/lib/arm')
-rw-r--r-- | media/libtheora/lib/arm/arm2gnu.pl | 281 | ||||
-rw-r--r-- | media/libtheora/lib/arm/armbits.h | 32 | ||||
-rw-r--r-- | media/libtheora/lib/arm/armbits.s | 236 | ||||
-rw-r--r-- | media/libtheora/lib/arm/armcpu.c | 116 | ||||
-rw-r--r-- | media/libtheora/lib/arm/armcpu.h | 29 | ||||
-rw-r--r-- | media/libtheora/lib/arm/armfrag.s | 662 | ||||
-rw-r--r-- | media/libtheora/lib/arm/armidct.s | 1914 | ||||
-rw-r--r-- | media/libtheora/lib/arm/armint.h | 126 | ||||
-rw-r--r-- | media/libtheora/lib/arm/armloop.s | 682 | ||||
-rw-r--r-- | media/libtheora/lib/arm/armopts.s | 39 | ||||
-rw-r--r-- | media/libtheora/lib/arm/armstate.c | 219 |
11 files changed, 4336 insertions, 0 deletions
diff --git a/media/libtheora/lib/arm/arm2gnu.pl b/media/libtheora/lib/arm/arm2gnu.pl new file mode 100644 index 000000000..5831bd81e --- /dev/null +++ b/media/libtheora/lib/arm/arm2gnu.pl @@ -0,0 +1,281 @@ +#!/usr/bin/perl + +my $bigend; # little/big endian +my $nxstack; + +$nxstack = 0; + +eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}' + if $running_under_some_shell; + +while ($ARGV[0] =~ /^-/) { + $_ = shift; + last if /^--/; + if (/^-n/) { + $nflag++; + next; + } + die "I don't recognize this switch: $_\\n"; +} +$printit++ unless $nflag; + +$\ = "\n"; # automatically add newline on print +$n=0; + +$thumb = 0; # ARM mode by default, not Thumb. + +LINE: +while (<>) { + + # For ADRLs we need to add a new line after the substituted one. + $addPadding = 0; + + # First, we do not dare to touch *anything* inside double quotes, do we? + # Second, if you want a dollar character in the string, + # insert two of them -- that's how ARM C and assembler treat strings. + s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1: .ascii \"/ && do { s/\$\$/\$/g; next }; + s/\bDCB\b[ \t]*\"/.ascii \"/ && do { s/\$\$/\$/g; next }; + s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/ && do { s/\$\$/\$/g; next }; + # If there's nothing on a line but a comment, don't try to apply any further + # substitutions (this is a cheap hack to avoid mucking up the license header) + s/^([ \t]*);/$1@/ && do { s/\$\$/\$/g; next }; + # If substituted -- leave immediately ! 
+ + s/@/,:/; + s/;/@/; + while ( /@.*'/ ) { + s/(@.*)'/$1/g; + } + s/\{FALSE\}/0/g; + s/\{TRUE\}/1/g; + s/\{(\w\w\w\w+)\}/$1/g; + s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/; + s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/; + s/\bIMPORT\b/.extern/; + s/\bEXPORT\b/.global/; + s/^(\s+)\[/$1IF/; + s/^(\s+)\|/$1ELSE/; + s/^(\s+)\]/$1ENDIF/; + s/IF *:DEF:/ .ifdef/; + s/IF *:LNOT: *:DEF:/ .ifndef/; + s/ELSE/ .else/; + s/ENDIF/ .endif/; + + if( /\bIF\b/ ) { + s/\bIF\b/ .if/; + s/=/==/; + } + if ( $n == 2) { + s/\$/\\/g; + } + if ($n == 1) { + s/\$//g; + s/label//g; + $n = 2; + } + if ( /MACRO/ ) { + s/MACRO *\n/.macro/; + $n=1; + } + if ( /\bMEND\b/ ) { + s/\bMEND\b/.endm/; + $n=0; + } + + # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there. + # + if ( /\bAREA\b/ ) { + if ( /CODE/ ) { + $nxstack = 1; + } + s/^(.+)CODE(.+)READONLY(.*)/ .text/; + s/^(.+)DATA(.+)READONLY(.*)/ .section .rdata\n .align 2/; + s/^(.+)\|\|\.data\|\|(.+)/ .data\n .align 2/; + s/^(.+)\|\|\.bss\|\|(.+)/ .bss/; + } + + s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/; # ||.constdata$3|| + s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/; # ||.bss$2|| + s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/; # ||.data$2|| + s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/; + s/^(\s+)\%(\s)/ .space $1/; + + s/\|(.+)\.(\d+)\|/\.$1_$2/; # |L80.123| -> .L80_123 + s/\bCODE32\b/.code 32/ && do {$thumb = 0}; + s/\bCODE16\b/.code 16/ && do {$thumb = 1}; + if (/\bPROC\b/) + { + print " .thumb_func" if ($thumb); + s/\bPROC\b/@ $&/; + } + s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/; + s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/; + s/\bENDP\b/@ $&/; + s/\bSUBT\b/@ $&/; + s/\bDATA\b/@ $&/; # DATA directive is deprecated -- Asm guide, p.7-25 + s/\bKEEP\b/@ $&/; + s/\bEXPORTAS\b/@ $&/; + s/\|\|(.)+\bEQU\b/@ $&/; + s/\|\|([\w\$]+)\|\|/$1/; + s/\bENTRY\b/@ $&/; + s/\bASSERT\b/@ $&/; + s/\bGBLL\b/@ $&/; + s/\bGBLA\b/@ $&/; + s/^\W+OPT\b/@ $&/; + s/:OR:/|/g; + s/:SHL:/<</g; + s/:SHR:/>>/g; + s/:AND:/&/g; + 
s/:LAND:/&&/g; + s/CPSR/cpsr/; + s/SPSR/spsr/; + s/ALIGN$/.balign 4/; + s/ALIGN\s+([0-9x]+)$/.balign $1/; + s/psr_cxsf/psr_all/; + s/LTORG/.ltorg/; + s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/; + s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/; + s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/; + s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/; + + # {PC} + 0xdeadfeed --> . + 0xdeadfeed + s/\{PC\} \+/ \. +/; + + # Single hex constant on the line ! + # + # >>> NOTE <<< + # Double-precision floats in gcc are always mixed-endian, which means + # bytes in two words are little-endian, but words are big-endian. + # So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address + # and 0xfeed0000 at high address. + # + s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/; + # Only decimal constants on the line, no hex ! + s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/; + + # Single hex constant on the line ! +# s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/; + # Only decimal constants on the line, no hex ! 
+# s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/; + s/\bDCFS[ \t]+0x/.word 0x/; + s/\bDCFS\b/.float/; + + s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/; + s/\bDCD\b/.word/; + s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/; + s/\bDCW\b/.short/; + s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/; + s/\bDCB\b/.byte/; + s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/; + s/^[A-Za-z_\.]\w+/$&:/; + s/^(\d+)/$1:/; + s/\%(\d+)/$1b_or_f/; + s/\%[Bb](\d+)/$1b/; + s/\%[Ff](\d+)/$1f/; + s/\%[Ff][Tt](\d+)/$1f/; + s/&([\dA-Fa-f]+)/0x$1/; + if ( /\b2_[01]+\b/ ) { + s/\b2_([01]+)\b/conv$1&&&&/g; + while ( /[01][01][01][01]&&&&/ ) { + s/0000&&&&/&&&&0/g; + s/0001&&&&/&&&&1/g; + s/0010&&&&/&&&&2/g; + s/0011&&&&/&&&&3/g; + s/0100&&&&/&&&&4/g; + s/0101&&&&/&&&&5/g; + s/0110&&&&/&&&&6/g; + s/0111&&&&/&&&&7/g; + s/1000&&&&/&&&&8/g; + s/1001&&&&/&&&&9/g; + s/1010&&&&/&&&&A/g; + s/1011&&&&/&&&&B/g; + s/1100&&&&/&&&&C/g; + s/1101&&&&/&&&&D/g; + s/1110&&&&/&&&&E/g; + s/1111&&&&/&&&&F/g; + } + s/000&&&&/&&&&0/g; + s/001&&&&/&&&&1/g; + s/010&&&&/&&&&2/g; + s/011&&&&/&&&&3/g; + s/100&&&&/&&&&4/g; + s/101&&&&/&&&&5/g; + s/110&&&&/&&&&6/g; + s/111&&&&/&&&&7/g; + s/00&&&&/&&&&0/g; + s/01&&&&/&&&&1/g; + s/10&&&&/&&&&2/g; + s/11&&&&/&&&&3/g; + s/0&&&&/&&&&0/g; + s/1&&&&/&&&&1/g; + s/conv&&&&/0x/g; + } + + if ( /commandline/) + { + if( /-bigend/) + { + $bigend=1; + } + } + + if ( /\bDCDU\b/ ) + { + my $cmd=$_; + my $value; + my $w1; + my $w2; + my $w3; + my $w4; + + s/\s+DCDU\b/@ $&/; + + $cmd =~ /\bDCDU\b\s+0x(\d+)/; + $value = $1; + $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/; + $w1 = $1; + $w2 = $2; + $w3 = $3; + $w4 = $4; + + if( $bigend ne "") + { + # big endian + + print " .byte 0x".$w1; + print " .byte 0x".$w2; + print " .byte 0x".$w3; + print " .byte 0x".$w4; + } + else + { + # little endian + + print " .byte 0x".$w4; + print " .byte 0x".$w3; + print " .byte 0x".$w2; + print " .byte 0x".$w1; + } + + } + + + if ( /\badrl\b/i ) + { + s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i; + $addPadding = 1; + } + s/\bEND\b/@ END/; +} continue { + printf 
("%s", $_) if $printit; + if ($addPadding != 0) + { + printf (" mov r0,r0\n"); + $addPadding = 0; + } +} +#If we had a code section, mark that this object doesn't need an executable +# stack. +if ($nxstack) { + printf (" .section\t.note.GNU-stack,\"\",\%\%progbits\n"); +} diff --git a/media/libtheora/lib/arm/armbits.h b/media/libtheora/lib/arm/armbits.h new file mode 100644 index 000000000..1540d7eb5 --- /dev/null +++ b/media/libtheora/lib/arm/armbits.h @@ -0,0 +1,32 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $ + + ********************************************************************/ +#if !defined(_arm_armbits_H) +# define _arm_armbits_H (1) +# include "../bitpack.h" +# include "armcpu.h" + +# if defined(OC_ARM_ASM) +# define oc_pack_read oc_pack_read_arm +# define oc_pack_read1 oc_pack_read1_arm +# define oc_huff_token_decode oc_huff_token_decode_arm +# endif + +long oc_pack_read_arm(oc_pack_buf *_b,int _bits); +int oc_pack_read1_arm(oc_pack_buf *_b); +int oc_huff_token_decode_arm(oc_pack_buf *_b,const ogg_int16_t *_tree); + +#endif diff --git a/media/libtheora/lib/arm/armbits.s b/media/libtheora/lib/arm/armbits.s new file mode 100644 index 000000000..0fdb6fdd3 --- /dev/null +++ b/media/libtheora/lib/arm/armbits.s @@ -0,0 +1,236 @@ +;******************************************************************** +;* * +;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. 
* +;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * +;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * +;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * +;* * +;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * +;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ * +;* * +;******************************************************************** +; +; function: +; last mod: $Id: armbits.s 17481 2010-10-03 22:49:42Z tterribe $ +; +;******************************************************************** + + AREA |.text|, CODE, READONLY + + ; Explicitly specifying alignment here because some versions of + ; gas don't align code correctly. See + ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html + ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992 + ALIGN + + EXPORT oc_pack_read_arm + EXPORT oc_pack_read1_arm + EXPORT oc_huff_token_decode_arm + +oc_pack_read1_arm PROC + ; r0 = oc_pack_buf *_b + ADD r12,r0,#8 + LDMIA r12,{r2,r3} ; r2 = window + ; Stall... ; r3 = available + ; Stall... + SUBS r3,r3,#1 ; r3 = available-1, available<1 => LT + BLT oc_pack_read1_refill + MOV r0,r2,LSR #31 ; r0 = window>>31 + MOV r2,r2,LSL #1 ; r2 = window<<=1 + STMIA r12,{r2,r3} ; window = r2 + ; available = r3 + MOV PC,r14 + ENDP + +oc_pack_read_arm PROC + ; r0 = oc_pack_buf *_b + ; r1 = int _bits + ADD r12,r0,#8 + LDMIA r12,{r2,r3} ; r2 = window + ; Stall... ; r3 = available + ; Stall... + SUBS r3,r3,r1 ; r3 = available-_bits, available<_bits => LT + BLT oc_pack_read_refill + RSB r0,r1,#32 ; r0 = 32-_bits + MOV r0,r2,LSR r0 ; r0 = window>>32-_bits + MOV r2,r2,LSL r1 ; r2 = window<<=_bits + STMIA r12,{r2,r3} ; window = r2 + ; available = r3 + MOV PC,r14 + +; We need to refill window. 
+oc_pack_read1_refill + MOV r1,#1 +oc_pack_read_refill + STMFD r13!,{r10,r11,r14} + LDMIA r0,{r10,r11} ; r10 = stop + ; r11 = ptr + RSB r0,r1,#32 ; r0 = 32-_bits + RSB r3,r3,r0 ; r3 = 32-available +; We can use unsigned compares for both the pointers and for available +; (allowing us to chain condition codes) because available will never be +; larger than 32 (or we wouldn't be here), and thus 32-available will never be +; negative. + CMP r10,r11 ; ptr<stop => HI + CMPHI r3,#7 ; available<=24 => HI + LDRHIB r14,[r11],#1 ; r14 = *ptr++ + SUBHI r3,#8 ; available += 8 + ; (HI) Stall... + ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available + CMPHI r10,r11 ; ptr<stop => HI + CMPHI r3,#7 ; available<=24 => HI + LDRHIB r14,[r11],#1 ; r14 = *ptr++ + SUBHI r3,#8 ; available += 8 + ; (HI) Stall... + ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available + CMPHI r10,r11 ; ptr<stop => HI + CMPHI r3,#7 ; available<=24 => HI + LDRHIB r14,[r11],#1 ; r14 = *ptr++ + SUBHI r3,#8 ; available += 8 + ; (HI) Stall... + ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available + CMPHI r10,r11 ; ptr<stop => HI + CMPHI r3,#7 ; available<=24 => HI + LDRHIB r14,[r11],#1 ; r14 = *ptr++ + SUBHI r3,#8 ; available += 8 + ; (HI) Stall... + ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available + SUBS r3,r0,r3 ; r3 = available-=_bits, available<_bits => LT + BLT oc_pack_read_refill_last + MOV r0,r2,LSR r0 ; r0 = window>>32-_bits + MOV r2,r2,LSL r1 ; r2 = window<<=_bits + STR r11,[r12,#-4] ; ptr = r11 + STMIA r12,{r2,r3} ; window = r2 + ; available = r3 + LDMFD r13!,{r10,r11,PC} + +; Either we wanted to read more than 24 bits and didn't have enough room to +; stuff the last byte into the window, or we hit the end of the packet. +oc_pack_read_refill_last + CMP r11,r10 ; ptr<stop => LO +; If we didn't hit the end of the packet, then pull enough of the next byte +; to fill up the window. + LDRLOB r14,[r11] ; (LO) r14 = *ptr +; Otherwise, set the EOF flag and pretend we have lots of available bits. 
+ MOVHS r14,#1 ; (HS) r14 = 1 + ADDLO r10,r3,r1 ; (LO) r10 = available + STRHS r14,[r12,#8] ; (HS) eof = 1 + ANDLO r10,r10,#7 ; (LO) r10 = available&7 + MOVHS r3,#1<<30 ; (HS) available = OC_LOTS_OF_BITS + ORRLO r2,r14,LSL r10 ; (LO) r2 = window|=*ptr>>(available&7) + MOV r0,r2,LSR r0 ; r0 = window>>32-_bits + MOV r2,r2,LSL r1 ; r2 = window<<=_bits + STR r11,[r12,#-4] ; ptr = r11 + STMIA r12,{r2,r3} ; window = r2 + ; available = r3 + LDMFD r13!,{r10,r11,PC} + ENDP + + + +oc_huff_token_decode_arm PROC + ; r0 = oc_pack_buf *_b + ; r1 = const ogg_int16_t *_tree + STMFD r13!,{r4,r5,r10,r14} + LDRSH r10,[r1] ; r10 = n=_tree[0] + LDMIA r0,{r2-r5} ; r2 = stop + ; Stall... ; r3 = ptr + ; Stall... ; r4 = window + ; r5 = available + CMP r10,r5 ; n>available => GT + BGT oc_huff_token_decode_refill0 + RSB r14,r10,#32 ; r14 = 32-n + MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n + ADD r14,r1,r14,LSL #1 ; r14 = _tree+bits + LDRSH r12,[r14,#2] ; r12 = node=_tree[1+bits] + ; Stall... + ; Stall... + RSBS r14,r12,#0 ; r14 = -node, node>0 => MI + BMI oc_huff_token_decode_continue + MOV r10,r14,LSR #8 ; r10 = n=node>>8 + MOV r4,r4,LSL r10 ; r4 = window<<=n + SUB r5,r10 ; r5 = available-=n + STMIB r0,{r3-r5} ; ptr = r3 + ; window = r4 + ; available = r5 + AND r0,r14,#255 ; r0 = node&255 + LDMFD r13!,{r4,r5,r10,pc} + +; The first tree node wasn't enough to reach a leaf, read another +oc_huff_token_decode_continue + ADD r12,r1,r12,LSL #1 ; r12 = _tree+node + MOV r4,r4,LSL r10 ; r4 = window<<=n + SUB r5,r5,r10 ; r5 = available-=n + LDRSH r10,[r12],#2 ; r10 = n=_tree[node] + ; Stall... ; r12 = _tree+node+1 + ; Stall... + CMP r10,r5 ; n>available => GT + BGT oc_huff_token_decode_refill + RSB r14,r10,#32 ; r14 = 32-n + MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n + ADD r12,r12,r14 ; + LDRSH r12,[r12,r14] ; r12 = node=_tree[node+1+bits] + ; Stall... + ; Stall... 
+ RSBS r14,r12,#0 ; r14 = -node, node>0 => MI + BMI oc_huff_token_decode_continue + MOV r10,r14,LSR #8 ; r10 = n=node>>8 + MOV r4,r4,LSL r10 ; r4 = window<<=n + SUB r5,r10 ; r5 = available-=n + STMIB r0,{r3-r5} ; ptr = r3 + ; window = r4 + ; available = r5 + AND r0,r14,#255 ; r0 = node&255 + LDMFD r13!,{r4,r5,r10,pc} + +oc_huff_token_decode_refill0 + ADD r12,r1,#2 ; r12 = _tree+1 +oc_huff_token_decode_refill +; We can't possibly need more than 15 bits, so available must be <= 15. +; Therefore we can load at least two bytes without checking it. + CMP r2,r3 ; ptr<stop => HI + LDRHIB r14,[r3],#1 ; r14 = *ptr++ + RSBHI r5,r5,#24 ; (HI) available = 32-(available+=8) + RSBLS r5,r5,#32 ; (LS) r5 = 32-available + ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available + CMPHI r2,r3 ; ptr<stop => HI + LDRHIB r14,[r3],#1 ; r14 = *ptr++ + SUBHI r5,#8 ; available += 8 + ; (HI) Stall... + ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available +; We can use unsigned compares for both the pointers and for available +; (allowing us to chain condition codes) because available will never be +; larger than 32 (or we wouldn't be here), and thus 32-available will never be +; negative. + CMPHI r2,r3 ; ptr<stop => HI + CMPHI r5,#7 ; available<=24 => HI + LDRHIB r14,[r3],#1 ; r14 = *ptr++ + SUBHI r5,#8 ; available += 8 + ; (HI) Stall... + ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available + CMP r2,r3 ; ptr<stop => HI + MOVLS r5,#-1<<30 ; (LS) available = OC_LOTS_OF_BITS+32 + CMPHI r5,#7 ; (HI) available<=24 => HI + LDRHIB r14,[r3],#1 ; (HI) r14 = *ptr++ + SUBHI r5,#8 ; (HI) available += 8 + ; (HI) Stall... + ORRHI r4,r14,LSL r5 ; (HI) r4 = window|=r14<<32-available + RSB r14,r10,#32 ; r14 = 32-n + MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n + ADD r12,r12,r14 ; + LDRSH r12,[r12,r14] ; r12 = node=_tree[node+1+bits] + RSB r5,r5,#32 ; r5 = available + ; Stall... 
+ RSBS r14,r12,#0 ; r14 = -node, node>0 => MI + BMI oc_huff_token_decode_continue + MOV r10,r14,LSR #8 ; r10 = n=node>>8 + MOV r4,r4,LSL r10 ; r4 = window<<=n + SUB r5,r10 ; r5 = available-=n + STMIB r0,{r3-r5} ; ptr = r3 + ; window = r4 + ; available = r5 + AND r0,r14,#255 ; r0 = node&255 + LDMFD r13!,{r4,r5,r10,pc} + ENDP + + END diff --git a/media/libtheora/lib/arm/armcpu.c b/media/libtheora/lib/arm/armcpu.c new file mode 100644 index 000000000..8b0f9a857 --- /dev/null +++ b/media/libtheora/lib/arm/armcpu.c @@ -0,0 +1,116 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + CPU capability detection for ARM processors. + + function: + last mod: $Id: cpu.c 17344 2010-07-21 01:42:18Z tterribe $ + + ********************************************************************/ + +#include "armcpu.h" + +#if !defined(OC_ARM_ASM)|| \ + !defined(OC_ARM_ASM_EDSP)&&!defined(OC_ARM_ASM_ARMV6)&& \ + !defined(OC_ARM_ASM_NEON) +ogg_uint32_t oc_cpu_flags_get(void){ + return 0; +} + +#elif defined(_MSC_VER) +/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ +# define WIN32_LEAN_AND_MEAN +# define WIN32_EXTRA_LEAN +# include <windows.h> + +ogg_uint32_t oc_cpu_flags_get(void){ + ogg_uint32_t flags; + flags=0; + /*MSVC has no inline __asm support for ARM, but it does let you __emit + instructions via their assembled hex code. 
+ All of these instructions should be essentially nops.*/ +# if defined(OC_ARM_ASM_EDSP) + __try{ + /*PLD [r13]*/ + __emit(0xF5DDF000); + flags|=OC_CPU_ARM_EDSP; + } + __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ + /*Ignore exception.*/ + } +# if defined(OC_ARM_ASM_MEDIA) + __try{ + /*SHADD8 r3,r3,r3*/ + __emit(0xE6333F93); + flags|=OC_CPU_ARM_MEDIA; + } + __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ + /*Ignore exception.*/ + } +# if defined(OC_ARM_ASM_NEON) + __try{ + /*VORR q0,q0,q0*/ + __emit(0xF2200150); + flags|=OC_CPU_ARM_NEON; + } + __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ + /*Ignore exception.*/ + } +# endif +# endif +# endif + return flags; +} + +#elif defined(__linux__) +# include <stdio.h> +# include <stdlib.h> +# include <string.h> + +ogg_uint32_t oc_cpu_flags_get(void){ + ogg_uint32_t flags; + FILE *fin; + flags=0; + /*Reading /proc/self/auxv would be easier, but that doesn't work reliably on + Android. + This also means that detection will fail in Scratchbox.*/ + fin=fopen("/proc/cpuinfo","r"); + if(fin!=NULL){ + /*512 should be enough for anybody (it's even enough for all the flags that + x86 has accumulated... so far).*/ + char buf[512]; + while(fgets(buf,511,fin)!=NULL){ + if(memcmp(buf,"Features",8)==0){ + char *p; + p=strstr(buf," edsp"); + if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_EDSP; + p=strstr(buf," neon"); + if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_NEON; + } + if(memcmp(buf,"CPU architecture:",17)==0){ + int version; + version=atoi(buf+17); + if(version>=6)flags|=OC_CPU_ARM_MEDIA; + } + } + fclose(fin); + } + return flags; +} + +#else +/*The feature registers which can tell us what the processor supports are + accessible in privileged modes only, so we can't have a general user-space + detection method like on x86.*/ +# error "Configured to use ARM asm but no CPU detection method available for " \ + "your platform. Reconfigure with --disable-asm (or send patches)." 
+#endif diff --git a/media/libtheora/lib/arm/armcpu.h b/media/libtheora/lib/arm/armcpu.h new file mode 100644 index 000000000..18dd95821 --- /dev/null +++ b/media/libtheora/lib/arm/armcpu.h @@ -0,0 +1,29 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + function: + last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $ + + ********************************************************************/ + +#if !defined(_arm_armcpu_H) +# define _arm_armcpu_H (1) +#include "../internal.h" + +/*"Parallel instructions" from ARM v6 and above.*/ +#define OC_CPU_ARM_MEDIA (1<<24) +/*Flags chosen to match arch/arm/include/asm/hwcap.h in the Linux kernel.*/ +#define OC_CPU_ARM_EDSP (1<<7) +#define OC_CPU_ARM_NEON (1<<12) + +ogg_uint32_t oc_cpu_flags_get(void); + +#endif diff --git a/media/libtheora/lib/arm/armfrag.s b/media/libtheora/lib/arm/armfrag.s new file mode 100644 index 000000000..e20579eee --- /dev/null +++ b/media/libtheora/lib/arm/armfrag.s @@ -0,0 +1,662 @@ +;******************************************************************** +;* * +;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * +;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * +;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * +;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* +;* * +;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * +;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ * +;* * +;******************************************************************** +; Original implementation: +; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd +; last mod: $Id: armfrag.s 17481 2010-10-03 22:49:42Z tterribe $ +;******************************************************************** + + AREA |.text|, CODE, READONLY + + ; Explicitly specifying alignment here because some versions of + ; gas don't align code correctly. See + ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html + ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992 + ALIGN + + GET armopts.s + +; Vanilla ARM v4 versions + EXPORT oc_frag_copy_list_arm + EXPORT oc_frag_recon_intra_arm + EXPORT oc_frag_recon_inter_arm + EXPORT oc_frag_recon_inter2_arm + +oc_frag_copy_list_arm PROC + ; r0 = _dst_frame + ; r1 = _src_frame + ; r2 = _ystride + ; r3 = _fragis + ; <> = _nfragis + ; <> = _frag_buf_offs + LDR r12,[r13] ; r12 = _nfragis + STMFD r13!,{r4-r6,r11,r14} + SUBS r12, r12, #1 + LDR r4,[r3],#4 ; r4 = _fragis[fragii] + LDRGE r14,[r13,#4*6] ; r14 = _frag_buf_offs + BLT ofcl_arm_end + SUB r2, r2, #4 +ofcl_arm_lp + LDR r11,[r14,r4,LSL #2] ; r11 = _frag_buf_offs[_fragis[fragii]] + SUBS r12, r12, #1 + ; Stall (on XScale) + ADD r4, r1, r11 ; r4 = _src_frame+frag_buf_off + LDR r6, [r4], #4 + ADD r11,r0, r11 ; r11 = _dst_frame+frag_buf_off + LDR r5, [r4], r2 + STR r6, [r11],#4 + LDR r6, [r4], #4 + STR r5, [r11],r2 + LDR r5, [r4], r2 + STR r6, [r11],#4 + LDR r6, [r4], #4 + STR r5, [r11],r2 + LDR r5, [r4], r2 + STR r6, [r11],#4 + LDR r6, [r4], #4 + STR r5, [r11],r2 + LDR r5, [r4], r2 + STR r6, [r11],#4 + LDR r6, [r4], #4 + STR r5, [r11],r2 + LDR r5, [r4], r2 + STR r6, [r11],#4 + LDR r6, [r4], #4 + STR r5, [r11],r2 + LDR r5, [r4], r2 + STR r6, [r11],#4 + LDR r6, [r4], #4 + STR r5, [r11],r2 + LDR r5, [r4], r2 + STR r6, [r11],#4 + LDR r6, [r4], #4 
+ STR r5, [r11],r2 + LDR r5, [r4] + LDRGE r4,[r3],#4 ; r4 = _fragis[fragii] + STR r6, [r11],#4 + STR r5, [r11] + BGE ofcl_arm_lp +ofcl_arm_end + LDMFD r13!,{r4-r6,r11,PC} +oc_frag_recon_intra_arm + ; r0 = unsigned char *_dst + ; r1 = int _ystride + ; r2 = const ogg_int16_t _residue[64] + STMFD r13!,{r4,r5,r14} + MOV r14,#8 + MOV r5, #255 + SUB r1, r1, #7 +ofrintra_lp_arm + LDRSH r3, [r2], #2 + LDRSH r4, [r2], #2 + LDRSH r12,[r2], #2 + ADDS r3, r3, #128 + CMPGT r5, r3 + EORLT r3, r5, r3, ASR #32 + STRB r3, [r0], #1 + ADDS r4, r4, #128 + CMPGT r5, r4 + EORLT r4, r5, r4, ASR #32 + LDRSH r3, [r2], #2 + STRB r4, [r0], #1 + ADDS r12,r12,#128 + CMPGT r5, r12 + EORLT r12,r5, r12,ASR #32 + LDRSH r4, [r2], #2 + STRB r12,[r0], #1 + ADDS r3, r3, #128 + CMPGT r5, r3 + EORLT r3, r5, r3, ASR #32 + LDRSH r12,[r2], #2 + STRB r3, [r0], #1 + ADDS r4, r4, #128 + CMPGT r5, r4 + EORLT r4, r5, r4, ASR #32 + LDRSH r3, [r2], #2 + STRB r4, [r0], #1 + ADDS r12,r12,#128 + CMPGT r5, r12 + EORLT r12,r5, r12,ASR #32 + LDRSH r4, [r2], #2 + STRB r12,[r0], #1 + ADDS r3, r3, #128 + CMPGT r5, r3 + EORLT r3, r5, r3, ASR #32 + STRB r3, [r0], #1 + ADDS r4, r4, #128 + CMPGT r5, r4 + EORLT r4, r5, r4, ASR #32 + STRB r4, [r0], r1 + SUBS r14,r14,#1 + BGT ofrintra_lp_arm + LDMFD r13!,{r4,r5,PC} + ENDP + +oc_frag_recon_inter_arm PROC + ; r0 = unsigned char *dst + ; r1 = const unsigned char *src + ; r2 = int ystride + ; r3 = const ogg_int16_t residue[64] + STMFD r13!,{r5,r9-r11,r14} + MOV r9, #8 + MOV r5, #255 + SUB r2, r2, #7 +ofrinter_lp_arm + LDRSH r12,[r3], #2 + LDRB r14,[r1], #1 + LDRSH r11,[r3], #2 + LDRB r10,[r1], #1 + ADDS r12,r12,r14 + CMPGT r5, r12 + EORLT r12,r5, r12,ASR #32 + STRB r12,[r0], #1 + ADDS r11,r11,r10 + CMPGT r5, r11 + LDRSH r12,[r3], #2 + LDRB r14,[r1], #1 + EORLT r11,r5, r11,ASR #32 + STRB r11,[r0], #1 + ADDS r12,r12,r14 + CMPGT r5, r12 + LDRSH r11,[r3], #2 + LDRB r10,[r1], #1 + EORLT r12,r5, r12,ASR #32 + STRB r12,[r0], #1 + ADDS r11,r11,r10 + CMPGT r5, r11 + LDRSH r12,[r3], #2 + LDRB 
r14,[r1], #1 + EORLT r11,r5, r11,ASR #32 + STRB r11,[r0], #1 + ADDS r12,r12,r14 + CMPGT r5, r12 + LDRSH r11,[r3], #2 + LDRB r10,[r1], #1 + EORLT r12,r5, r12,ASR #32 + STRB r12,[r0], #1 + ADDS r11,r11,r10 + CMPGT r5, r11 + LDRSH r12,[r3], #2 + LDRB r14,[r1], #1 + EORLT r11,r5, r11,ASR #32 + STRB r11,[r0], #1 + ADDS r12,r12,r14 + CMPGT r5, r12 + LDRSH r11,[r3], #2 + LDRB r10,[r1], r2 + EORLT r12,r5, r12,ASR #32 + STRB r12,[r0], #1 + ADDS r11,r11,r10 + CMPGT r5, r11 + EORLT r11,r5, r11,ASR #32 + STRB r11,[r0], r2 + SUBS r9, r9, #1 + BGT ofrinter_lp_arm + LDMFD r13!,{r5,r9-r11,PC} + ENDP + +oc_frag_recon_inter2_arm PROC + ; r0 = unsigned char *dst + ; r1 = const unsigned char *src1 + ; r2 = const unsigned char *src2 + ; r3 = int ystride + LDR r12,[r13] + ; r12= const ogg_int16_t residue[64] + STMFD r13!,{r4-r8,r14} + MOV r14,#8 + MOV r8, #255 + SUB r3, r3, #7 +ofrinter2_lp_arm + LDRB r5, [r1], #1 + LDRB r6, [r2], #1 + LDRSH r4, [r12],#2 + LDRB r7, [r1], #1 + ADD r5, r5, r6 + ADDS r5, r4, r5, LSR #1 + CMPGT r8, r5 + LDRB r6, [r2], #1 + LDRSH r4, [r12],#2 + EORLT r5, r8, r5, ASR #32 + STRB r5, [r0], #1 + ADD r7, r7, r6 + ADDS r7, r4, r7, LSR #1 + CMPGT r8, r7 + LDRB r5, [r1], #1 + LDRB r6, [r2], #1 + LDRSH r4, [r12],#2 + EORLT r7, r8, r7, ASR #32 + STRB r7, [r0], #1 + ADD r5, r5, r6 + ADDS r5, r4, r5, LSR #1 + CMPGT r8, r5 + LDRB r7, [r1], #1 + LDRB r6, [r2], #1 + LDRSH r4, [r12],#2 + EORLT r5, r8, r5, ASR #32 + STRB r5, [r0], #1 + ADD r7, r7, r6 + ADDS r7, r4, r7, LSR #1 + CMPGT r8, r7 + LDRB r5, [r1], #1 + LDRB r6, [r2], #1 + LDRSH r4, [r12],#2 + EORLT r7, r8, r7, ASR #32 + STRB r7, [r0], #1 + ADD r5, r5, r6 + ADDS r5, r4, r5, LSR #1 + CMPGT r8, r5 + LDRB r7, [r1], #1 + LDRB r6, [r2], #1 + LDRSH r4, [r12],#2 + EORLT r5, r8, r5, ASR #32 + STRB r5, [r0], #1 + ADD r7, r7, r6 + ADDS r7, r4, r7, LSR #1 + CMPGT r8, r7 + LDRB r5, [r1], #1 + LDRB r6, [r2], #1 + LDRSH r4, [r12],#2 + EORLT r7, r8, r7, ASR #32 + STRB r7, [r0], #1 + ADD r5, r5, r6 + ADDS r5, r4, r5, LSR #1 + CMPGT 
r8, r5 + LDRB r7, [r1], r3 + LDRB r6, [r2], r3 + LDRSH r4, [r12],#2 + EORLT r5, r8, r5, ASR #32 + STRB r5, [r0], #1 + ADD r7, r7, r6 + ADDS r7, r4, r7, LSR #1 + CMPGT r8, r7 + EORLT r7, r8, r7, ASR #32 + STRB r7, [r0], r3 + SUBS r14,r14,#1 + BGT ofrinter2_lp_arm + LDMFD r13!,{r4-r8,PC} + ENDP + + [ OC_ARM_ASM_EDSP + EXPORT oc_frag_copy_list_edsp + +oc_frag_copy_list_edsp PROC + ; r0 = _dst_frame + ; r1 = _src_frame + ; r2 = _ystride + ; r3 = _fragis + ; <> = _nfragis + ; <> = _frag_buf_offs + LDR r12,[r13] ; r12 = _nfragis + STMFD r13!,{r4-r11,r14} + SUBS r12, r12, #1 + LDRGE r5, [r3],#4 ; r5 = _fragis[fragii] + LDRGE r14,[r13,#4*10] ; r14 = _frag_buf_offs + BLT ofcl_edsp_end +ofcl_edsp_lp + MOV r4, r1 + LDR r5, [r14,r5, LSL #2] ; r5 = _frag_buf_offs[_fragis[fragii]] + SUBS r12, r12, #1 + ; Stall (on XScale) + LDRD r6, [r4, r5]! ; r4 = _src_frame+frag_buf_off + LDRD r8, [r4, r2]! + ; Stall + STRD r6, [r5, r0]! ; r5 = _dst_frame+frag_buf_off + STRD r8, [r5, r2]! + ; Stall + LDRD r6, [r4, r2]! ; On Xscale at least, doing 3 consecutive + LDRD r8, [r4, r2]! ; loads causes a stall, but that's no worse + LDRD r10,[r4, r2]! ; than us only doing 2, and having to do + ; another pair of LDRD/STRD later on. + ; Stall + STRD r6, [r5, r2]! + STRD r8, [r5, r2]! + STRD r10,[r5, r2]! + LDRD r6, [r4, r2]! + LDRD r8, [r4, r2]! + LDRD r10,[r4, r2]! + STRD r6, [r5, r2]! + STRD r8, [r5, r2]! + STRD r10,[r5, r2]! 
+ LDRGE r5, [r3],#4 ; r5 = _fragis[fragii] + BGE ofcl_edsp_lp +ofcl_edsp_end + LDMFD r13!,{r4-r11,PC} + ENDP + ] + + [ OC_ARM_ASM_MEDIA + EXPORT oc_frag_recon_intra_v6 + EXPORT oc_frag_recon_inter_v6 + EXPORT oc_frag_recon_inter2_v6 + +oc_frag_recon_intra_v6 PROC + ; r0 = unsigned char *_dst + ; r1 = int _ystride + ; r2 = const ogg_int16_t _residue[64] + STMFD r13!,{r4-r6,r14} + MOV r14,#8 + MOV r12,r2 + LDR r6, =0x00800080 +ofrintra_v6_lp + LDRD r2, [r12],#8 ; r2 = 11110000 r3 = 33332222 + LDRD r4, [r12],#8 ; r4 = 55554444 r5 = 77776666 + SUBS r14,r14,#1 + QADD16 r2, r2, r6 + QADD16 r3, r3, r6 + QADD16 r4, r4, r6 + QADD16 r5, r5, r6 + USAT16 r2, #8, r2 ; r2 = __11__00 + USAT16 r3, #8, r3 ; r3 = __33__22 + USAT16 r4, #8, r4 ; r4 = __55__44 + USAT16 r5, #8, r5 ; r5 = __77__66 + ORR r2, r2, r2, LSR #8 ; r2 = __111100 + ORR r3, r3, r3, LSR #8 ; r3 = __333322 + ORR r4, r4, r4, LSR #8 ; r4 = __555544 + ORR r5, r5, r5, LSR #8 ; r5 = __777766 + PKHBT r2, r2, r3, LSL #16 ; r2 = 33221100 + PKHBT r3, r4, r5, LSL #16 ; r3 = 77665544 + STRD r2, [r0], r1 + BGT ofrintra_v6_lp + LDMFD r13!,{r4-r6,PC} + ENDP + +oc_frag_recon_inter_v6 PROC + ; r0 = unsigned char *_dst + ; r1 = const unsigned char *_src + ; r2 = int _ystride + ; r3 = const ogg_int16_t _residue[64] + STMFD r13!,{r4-r7,r14} + MOV r14,#8 +ofrinter_v6_lp + LDRD r6, [r3], #8 ; r6 = 11110000 r7 = 33332222 + SUBS r14,r14,#1 + [ OC_ARM_CAN_UNALIGN_LDRD + LDRD r4, [r1], r2 ; Unaligned ; r4 = 33221100 r5 = 77665544 + | + LDR r5, [r1, #4] + LDR r4, [r1], r2 + ] + PKHBT r12,r6, r7, LSL #16 ; r12= 22220000 + PKHTB r7, r7, r6, ASR #16 ; r7 = 33331111 + UXTB16 r6,r4 ; r6 = __22__00 + UXTB16 r4,r4, ROR #8 ; r4 = __33__11 + QADD16 r12,r12,r6 ; r12= xx22xx00 + QADD16 r4, r7, r4 ; r4 = xx33xx11 + LDRD r6, [r3], #8 ; r6 = 55554444 r7 = 77776666 + USAT16 r4, #8, r4 ; r4 = __33__11 + USAT16 r12,#8,r12 ; r12= __22__00 + ORR r4, r12,r4, LSL #8 ; r4 = 33221100 + PKHBT r12,r6, r7, LSL #16 ; r12= 66664444 + PKHTB r7, r7, r6, ASR #16 ; r7 = 
77775555 + UXTB16 r6,r5 ; r6 = __66__44 + UXTB16 r5,r5, ROR #8 ; r5 = __77__55 + QADD16 r12,r12,r6 ; r12= xx66xx44 + QADD16 r5, r7, r5 ; r5 = xx77xx55 + USAT16 r12,#8, r12 ; r12= __66__44 + USAT16 r5, #8, r5 ; r5 = __77__55 + ORR r5, r12,r5, LSL #8 ; r5 = 77665544 + STRD r4, [r0], r2 + BGT ofrinter_v6_lp + LDMFD r13!,{r4-r7,PC} + ENDP + +oc_frag_recon_inter2_v6 PROC + ; r0 = unsigned char *_dst + ; r1 = const unsigned char *_src1 + ; r2 = const unsigned char *_src2 + ; r3 = int _ystride + LDR r12,[r13] + ; r12= const ogg_int16_t _residue[64] + STMFD r13!,{r4-r9,r14} + MOV r14,#8 +ofrinter2_v6_lp + LDRD r6, [r12,#8] ; r6 = 55554444 r7 = 77776666 + SUBS r14,r14,#1 + LDR r4, [r1, #4] ; Unaligned ; r4 = src1[1] = 77665544 + LDR r5, [r2, #4] ; Unaligned ; r5 = src2[1] = 77665544 + PKHBT r8, r6, r7, LSL #16 ; r8 = 66664444 + PKHTB r9, r7, r6, ASR #16 ; r9 = 77775555 + UHADD8 r4, r4, r5 ; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1 + UXTB16 r5, r4 ; r5 = __66__44 + UXTB16 r4, r4, ROR #8 ; r4 = __77__55 + QADD16 r8, r8, r5 ; r8 = xx66xx44 + QADD16 r9, r9, r4 ; r9 = xx77xx55 + LDRD r6,[r12],#16 ; r6 = 11110000 r7 = 33332222 + USAT16 r8, #8, r8 ; r8 = __66__44 + LDR r4, [r1], r3 ; Unaligned ; r4 = src1[0] = 33221100 + USAT16 r9, #8, r9 ; r9 = __77__55 + LDR r5, [r2], r3 ; Unaligned ; r5 = src2[0] = 33221100 + ORR r9, r8, r9, LSL #8 ; r9 = 77665544 + PKHBT r8, r6, r7, LSL #16 ; r8 = 22220000 + UHADD8 r4, r4, r5 ; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1 + PKHTB r7, r7, r6, ASR #16 ; r7 = 33331111 + UXTB16 r5, r4 ; r5 = __22__00 + UXTB16 r4, r4, ROR #8 ; r4 = __33__11 + QADD16 r8, r8, r5 ; r8 = xx22xx00 + QADD16 r7, r7, r4 ; r7 = xx33xx11 + USAT16 r8, #8, r8 ; r8 = __22__00 + USAT16 r7, #8, r7 ; r7 = __33__11 + ORR r8, r8, r7, LSL #8 ; r8 = 33221100 + STRD r8, [r0], r3 + BGT ofrinter2_v6_lp + LDMFD r13!,{r4-r9,PC} + ENDP + ] + + [ OC_ARM_ASM_NEON + EXPORT oc_frag_copy_list_neon + EXPORT oc_frag_recon_intra_neon + EXPORT oc_frag_recon_inter_neon + EXPORT oc_frag_recon_inter2_neon + 
+oc_frag_copy_list_neon PROC + ; r0 = _dst_frame + ; r1 = _src_frame + ; r2 = _ystride + ; r3 = _fragis + ; <> = _nfragis + ; <> = _frag_buf_offs + LDR r12,[r13] ; r12 = _nfragis + STMFD r13!,{r4-r7,r14} + CMP r12, #1 + LDRGE r6, [r3] ; r6 = _fragis[fragii] + LDRGE r14,[r13,#4*6] ; r14 = _frag_buf_offs + BLT ofcl_neon_end + ; Stall (2 on Xscale) + LDR r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]] + ; Stall (on XScale) + MOV r7, r6 ; Guarantee PLD points somewhere valid. +ofcl_neon_lp + ADD r4, r1, r6 + VLD1.64 {D0}, [r4@64], r2 + ADD r5, r0, r6 + VLD1.64 {D1}, [r4@64], r2 + SUBS r12, r12, #1 + VLD1.64 {D2}, [r4@64], r2 + LDRGT r6, [r3,#4]! ; r6 = _fragis[fragii] + VLD1.64 {D3}, [r4@64], r2 + LDRGT r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]] + VLD1.64 {D4}, [r4@64], r2 + ADDGT r7, r1, r6 + VLD1.64 {D5}, [r4@64], r2 + PLD [r7] + VLD1.64 {D6}, [r4@64], r2 + PLD [r7, r2] + VLD1.64 {D7}, [r4@64] + PLD [r7, r2, LSL #1] + VST1.64 {D0}, [r5@64], r2 + ADDGT r7, r7, r2, LSL #2 + VST1.64 {D1}, [r5@64], r2 + PLD [r7, -r2] + VST1.64 {D2}, [r5@64], r2 + PLD [r7] + VST1.64 {D3}, [r5@64], r2 + PLD [r7, r2] + VST1.64 {D4}, [r5@64], r2 + PLD [r7, r2, LSL #1] + VST1.64 {D5}, [r5@64], r2 + ADDGT r7, r7, r2, LSL #2 + VST1.64 {D6}, [r5@64], r2 + PLD [r7, -r2] + VST1.64 {D7}, [r5@64] + BGT ofcl_neon_lp +ofcl_neon_end + LDMFD r13!,{r4-r7,PC} + ENDP + +oc_frag_recon_intra_neon PROC + ; r0 = unsigned char *_dst + ; r1 = int _ystride + ; r2 = const ogg_int16_t _residue[64] + MOV r3, #128 + VDUP.S16 Q0, r3 + VLDMIA r2, {D16-D31} ; D16= 3333222211110000 etc ; 9(8) cycles + VQADD.S16 Q8, Q8, Q0 + VQADD.S16 Q9, Q9, Q0 + VQADD.S16 Q10,Q10,Q0 + VQADD.S16 Q11,Q11,Q0 + VQADD.S16 Q12,Q12,Q0 + VQADD.S16 Q13,Q13,Q0 + VQADD.S16 Q14,Q14,Q0 + VQADD.S16 Q15,Q15,Q0 + VQMOVUN.S16 D16,Q8 ; D16= 7766554433221100 ; 1 cycle + VQMOVUN.S16 D17,Q9 ; D17= FFEEDDCCBBAA9988 ; 1 cycle + VQMOVUN.S16 D18,Q10 ; D18= NNMMLLKKJJIIHHGG ; 1 cycle + VST1.64 {D16},[r0@64], r1 + VQMOVUN.S16 D19,Q11 
; D19= VVUUTTSSRRQQPPOO ; 1 cycle + VST1.64 {D17},[r0@64], r1 + VQMOVUN.S16 D20,Q12 ; D20= ddccbbaaZZYYXXWW ; 1 cycle + VST1.64 {D18},[r0@64], r1 + VQMOVUN.S16 D21,Q13 ; D21= llkkjjiihhggffee ; 1 cycle + VST1.64 {D19},[r0@64], r1 + VQMOVUN.S16 D22,Q14 ; D22= ttssrrqqppoonnmm ; 1 cycle + VST1.64 {D20},[r0@64], r1 + VQMOVUN.S16 D23,Q15 ; D23= !!@@zzyyxxwwvvuu ; 1 cycle + VST1.64 {D21},[r0@64], r1 + VST1.64 {D22},[r0@64], r1 + VST1.64 {D23},[r0@64], r1 + MOV PC,R14 + ENDP + +oc_frag_recon_inter_neon PROC + ; r0 = unsigned char *_dst + ; r1 = const unsigned char *_src + ; r2 = int _ystride + ; r3 = const ogg_int16_t _residue[64] + VLDMIA r3, {D16-D31} ; D16= 3333222211110000 etc ; 9(8) cycles + VLD1.64 {D0}, [r1], r2 + VLD1.64 {D2}, [r1], r2 + VMOVL.U8 Q0, D0 ; Q0 = __77__66__55__44__33__22__11__00 + VLD1.64 {D4}, [r1], r2 + VMOVL.U8 Q1, D2 ; etc + VLD1.64 {D6}, [r1], r2 + VMOVL.U8 Q2, D4 + VMOVL.U8 Q3, D6 + VQADD.S16 Q8, Q8, Q0 + VLD1.64 {D0}, [r1], r2 + VQADD.S16 Q9, Q9, Q1 + VLD1.64 {D2}, [r1], r2 + VQADD.S16 Q10,Q10,Q2 + VLD1.64 {D4}, [r1], r2 + VQADD.S16 Q11,Q11,Q3 + VLD1.64 {D6}, [r1], r2 + VMOVL.U8 Q0, D0 + VMOVL.U8 Q1, D2 + VMOVL.U8 Q2, D4 + VMOVL.U8 Q3, D6 + VQADD.S16 Q12,Q12,Q0 + VQADD.S16 Q13,Q13,Q1 + VQADD.S16 Q14,Q14,Q2 + VQADD.S16 Q15,Q15,Q3 + VQMOVUN.S16 D16,Q8 + VQMOVUN.S16 D17,Q9 + VQMOVUN.S16 D18,Q10 + VST1.64 {D16},[r0@64], r2 + VQMOVUN.S16 D19,Q11 + VST1.64 {D17},[r0@64], r2 + VQMOVUN.S16 D20,Q12 + VST1.64 {D18},[r0@64], r2 + VQMOVUN.S16 D21,Q13 + VST1.64 {D19},[r0@64], r2 + VQMOVUN.S16 D22,Q14 + VST1.64 {D20},[r0@64], r2 + VQMOVUN.S16 D23,Q15 + VST1.64 {D21},[r0@64], r2 + VST1.64 {D22},[r0@64], r2 + VST1.64 {D23},[r0@64], r2 + MOV PC,R14 + ENDP + +oc_frag_recon_inter2_neon PROC + ; r0 = unsigned char *_dst + ; r1 = const unsigned char *_src1 + ; r2 = const unsigned char *_src2 + ; r3 = int _ystride + LDR r12,[r13] + ; r12= const ogg_int16_t _residue[64] + VLDMIA r12,{D16-D31} + VLD1.64 {D0}, [r1], r3 + VLD1.64 {D4}, [r2], r3 + VLD1.64 {D1}, [r1], 
r3 + VLD1.64 {D5}, [r2], r3 + VHADD.U8 Q2, Q0, Q2 ; Q2 = FFEEDDCCBBAA99887766554433221100 + VLD1.64 {D2}, [r1], r3 + VLD1.64 {D6}, [r2], r3 + VMOVL.U8 Q0, D4 ; Q0 = __77__66__55__44__33__22__11__00 + VLD1.64 {D3}, [r1], r3 + VMOVL.U8 Q2, D5 ; etc + VLD1.64 {D7}, [r2], r3 + VHADD.U8 Q3, Q1, Q3 + VQADD.S16 Q8, Q8, Q0 + VQADD.S16 Q9, Q9, Q2 + VLD1.64 {D0}, [r1], r3 + VMOVL.U8 Q1, D6 + VLD1.64 {D4}, [r2], r3 + VMOVL.U8 Q3, D7 + VLD1.64 {D1}, [r1], r3 + VQADD.S16 Q10,Q10,Q1 + VLD1.64 {D5}, [r2], r3 + VQADD.S16 Q11,Q11,Q3 + VLD1.64 {D2}, [r1], r3 + VHADD.U8 Q2, Q0, Q2 + VLD1.64 {D6}, [r2], r3 + VLD1.64 {D3}, [r1], r3 + VMOVL.U8 Q0, D4 + VLD1.64 {D7}, [r2], r3 + VMOVL.U8 Q2, D5 + VHADD.U8 Q3, Q1, Q3 + VQADD.S16 Q12,Q12,Q0 + VQADD.S16 Q13,Q13,Q2 + VMOVL.U8 Q1, D6 + VMOVL.U8 Q3, D7 + VQADD.S16 Q14,Q14,Q1 + VQADD.S16 Q15,Q15,Q3 + VQMOVUN.S16 D16,Q8 + VQMOVUN.S16 D17,Q9 + VQMOVUN.S16 D18,Q10 + VST1.64 {D16},[r0@64], r3 + VQMOVUN.S16 D19,Q11 + VST1.64 {D17},[r0@64], r3 + VQMOVUN.S16 D20,Q12 + VST1.64 {D18},[r0@64], r3 + VQMOVUN.S16 D21,Q13 + VST1.64 {D19},[r0@64], r3 + VQMOVUN.S16 D22,Q14 + VST1.64 {D20},[r0@64], r3 + VQMOVUN.S16 D23,Q15 + VST1.64 {D21},[r0@64], r3 + VST1.64 {D22},[r0@64], r3 + VST1.64 {D23},[r0@64], r3 + MOV PC,R14 + ENDP + ] + + END diff --git a/media/libtheora/lib/arm/armidct.s b/media/libtheora/lib/arm/armidct.s new file mode 100644 index 000000000..babd846ec --- /dev/null +++ b/media/libtheora/lib/arm/armidct.s @@ -0,0 +1,1914 @@ +;******************************************************************** +;* * +;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * +;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * +;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * +;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* +;* * +;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * +;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ * +;* * +;******************************************************************** +; Original implementation: +; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd +; last mod: $Id: armidct.s 17481 2010-10-03 22:49:42Z tterribe $ +;******************************************************************** + + AREA |.text|, CODE, READONLY + + ; Explicitly specifying alignment here because some versions of + ; gas don't align code correctly. See + ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html + ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992 + ALIGN + + GET armopts.s + + EXPORT oc_idct8x8_1_arm + EXPORT oc_idct8x8_arm + +oc_idct8x8_1_arm PROC + ; r0 = ogg_int16_t *_y + ; r1 = ogg_uint16_t _dc + ORR r1, r1, r1, LSL #16 + MOV r2, r1 + MOV r3, r1 + MOV r12,r1 + STMIA r0!,{r1,r2,r3,r12} + STMIA r0!,{r1,r2,r3,r12} + STMIA r0!,{r1,r2,r3,r12} + STMIA r0!,{r1,r2,r3,r12} + STMIA r0!,{r1,r2,r3,r12} + STMIA r0!,{r1,r2,r3,r12} + STMIA r0!,{r1,r2,r3,r12} + STMIA r0!,{r1,r2,r3,r12} + MOV PC, r14 + ENDP + +oc_idct8x8_arm PROC + ; r0 = ogg_int16_t *_y + ; r1 = ogg_int16_t *_x + ; r2 = int _last_zzi + CMP r2, #3 + BLE oc_idct8x8_3_arm + CMP r2, #6 + BLE oc_idct8x8_6_arm + CMP r2, #10 + BLE oc_idct8x8_10_arm +oc_idct8x8_slow_arm + STMFD r13!,{r4-r11,r14} + SUB r13,r13,#64*2 +; Row transforms + STR r0, [r13,#-4]! + ADD r0, r13, #4 ; Write to temp storage. + BL idct8core_arm + BL idct8core_arm + BL idct8core_arm + BL idct8core_arm + BL idct8core_arm + BL idct8core_arm + BL idct8core_arm + BL idct8core_arm + LDR r0, [r13], #4 ; Write to the final destination. + ; Clear input data for next block (decoder only). + SUB r2, r1, #8*16 + CMP r0, r2 + MOV r1, r13 ; And read from temp storage. 
+ BEQ oc_idct8x8_slow_arm_cols + MOV r4, #0 + MOV r5, #0 + MOV r6, #0 + MOV r7, #0 + STMIA r2!,{r4,r5,r6,r7} + STMIA r2!,{r4,r5,r6,r7} + STMIA r2!,{r4,r5,r6,r7} + STMIA r2!,{r4,r5,r6,r7} + STMIA r2!,{r4,r5,r6,r7} + STMIA r2!,{r4,r5,r6,r7} + STMIA r2!,{r4,r5,r6,r7} + STMIA r2!,{r4,r5,r6,r7} +oc_idct8x8_slow_arm_cols +; Column transforms + BL idct8core_down_arm + BL idct8core_down_arm + BL idct8core_down_arm + BL idct8core_down_arm + BL idct8core_down_arm + BL idct8core_down_arm + BL idct8core_down_arm + BL idct8core_down_arm + ADD r13,r13,#64*2 + LDMFD r13!,{r4-r11,PC} + ENDP + +oc_idct8x8_10_arm PROC + STMFD r13!,{r4-r11,r14} + SUB r13,r13,#64*2 +; Row transforms + MOV r2, r0 + MOV r0, r13 ; Write to temp storage. + BL idct4core_arm + BL idct3core_arm + BL idct2core_arm + BL idct1core_arm + ; Clear input data for next block (decoder only). + SUB r0, r1, #4*16 + CMP r0, r2 + MOV r1, r13 ; Read from temp storage. + BEQ oc_idct8x8_10_arm_cols + MOV r4, #0 + STR r4, [r0] + STR r4, [r0,#4] + STR r4, [r0,#16] + STR r4, [r0,#20] + STR r4, [r0,#32] + STR r4, [r0,#48] + MOV r0, r2 ; Write to the final destination +oc_idct8x8_10_arm_cols +; Column transforms + BL idct4core_down_arm + BL idct4core_down_arm + BL idct4core_down_arm + BL idct4core_down_arm + BL idct4core_down_arm + BL idct4core_down_arm + BL idct4core_down_arm + BL idct4core_down_arm + ADD r13,r13,#64*2 + LDMFD r13!,{r4-r11,PC} + ENDP + +oc_idct8x8_6_arm PROC + STMFD r13!,{r4-r7,r9-r11,r14} + SUB r13,r13,#64*2 +; Row transforms + MOV r2, r0 + MOV r0, r13 ; Write to temp storage. + BL idct3core_arm + BL idct2core_arm + BL idct1core_arm + ; Clear input data for next block (decoder only). + SUB r0, r1, #3*16 + CMP r0, r2 + MOV r1, r13 ; Read from temp storage. 
+ BEQ oc_idct8x8_6_arm_cols + MOV r4, #0 + STR r4, [r0] + STR r4, [r0,#4] + STR r4, [r0,#16] + STR r4, [r0,#32] + MOV r0, r2 ; Write to the final destination +oc_idct8x8_6_arm_cols +; Column transforms + BL idct3core_down_arm + BL idct3core_down_arm + BL idct3core_down_arm + BL idct3core_down_arm + BL idct3core_down_arm + BL idct3core_down_arm + BL idct3core_down_arm + BL idct3core_down_arm + ADD r13,r13,#64*2 + LDMFD r13!,{r4-r7,r9-r11,PC} + ENDP + +oc_idct8x8_3_arm PROC + STMFD r13!,{r4-r7,r9-r11,r14} + SUB r13,r13,#64*2 +; Row transforms + MOV r2, r0 + MOV r0, r13 ; Write to temp storage. + BL idct2core_arm + BL idct1core_arm + ; Clear input data for next block (decoder only). + SUB r0, r1, #2*16 + CMP r0, r2 + MOV r1, r13 ; Read from temp storage. + MOVNE r4, #0 + STRNE r4, [r0] + STRNE r4, [r0,#16] + MOVNE r0, r2 ; Write to the final destination +; Column transforms + BL idct2core_down_arm + BL idct2core_down_arm + BL idct2core_down_arm + BL idct2core_down_arm + BL idct2core_down_arm + BL idct2core_down_arm + BL idct2core_down_arm + BL idct2core_down_arm + ADD r13,r13,#64*2 + LDMFD r13!,{r4-r7,r9-r11,PC} + ENDP + +idct1core_arm PROC + ; r0 = ogg_int16_t *_y (destination) + ; r1 = const ogg_int16_t *_x (source) + LDRSH r3, [r1], #16 + MOV r12,#0x05 + ORR r12,r12,#0xB500 + MUL r3, r12, r3 + ; Stall ? 
+ MOV r3, r3, ASR #16 + STRH r3, [r0], #2 + STRH r3, [r0, #14] + STRH r3, [r0, #30] + STRH r3, [r0, #46] + STRH r3, [r0, #62] + STRH r3, [r0, #78] + STRH r3, [r0, #94] + STRH r3, [r0, #110] + MOV PC,R14 + ENDP + +idct2core_arm PROC + ; r0 = ogg_int16_t *_y (destination) + ; r1 = const ogg_int16_t *_x (source) + LDRSH r9, [r1], #16 ; r9 = x[0] + LDR r12,OC_C4S4 + LDRSH r11,[r1, #-14] ; r11= x[1] + LDR r3, OC_C7S1 + MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] + LDR r10,OC_C1S7 + MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1] + MOV r9, r9, ASR #16 ; r9 = t[0] + MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1] + MOV r3, r3, ASR #16 ; r3 = t[4] + MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4] + MOV r11,r11,ASR #16 ; r11= t[7] + MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] + MOV r10,r10,ASR #16 ; r10= t[5] + ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6] + ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5] + SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5] + ADD r3, r3, r9 ; r3 = t[0]+t[4] + ADD r11,r11,r9 ; r11= t[0]+t[7] + STRH r11,[r0], #2 ; y[0] = t[0]+t[7] + STRH r12,[r0, #14] ; y[1] = t[0]+t[6] + STRH r10,[r0, #30] ; y[2] = t[0]+t[5] + STRH r3, [r0, #46] ; y[3] = t[0]+t[4] + RSB r3, r3, r9, LSL #1 ; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4] + RSB r10,r10,r9, LSL #1 ; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5] + RSB r12,r12,r9, LSL #1 ; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6] + RSB r11,r11,r9, LSL #1 ; r11= t[0]*2-(t[0]+t[7])=t[0]-t[7] + STRH r3, [r0, #62] ; y[4] = t[0]-t[4] + STRH r10,[r0, #78] ; y[5] = t[0]-t[5] + STRH r12,[r0, #94] ; y[6] = t[0]-t[6] + STRH r11,[r0, #110] ; y[7] = t[0]-t[7] + MOV PC,r14 + ENDP + +idct2core_down_arm PROC + ; r0 = ogg_int16_t *_y (destination) + ; r1 = const ogg_int16_t *_x (source) + LDRSH r9, [r1], #16 ; r9 = x[0] + LDR r12,OC_C4S4 + LDRSH r11,[r1, #-14] ; r11= x[1] + LDR r3, OC_C7S1 + MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] + LDR r10,OC_C1S7 + MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1] + MOV r9, r9, ASR #16 ; r9 = t[0] + MUL r11,r10,r11 ; 
r11= t[7]<<16 = OC_C1S7*x[1] + ADD r9, r9, #8 ; r9 = t[0]+8 + MOV r3, r3, ASR #16 ; r3 = t[4] + MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4] + MOV r11,r11,ASR #16 ; r11= t[7] + MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] + MOV r10,r10,ASR #16 ; r10= t[5] + ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6]+8 + ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8 + SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8 + ADD r3, r3, r9 ; r3 = t[0]+t[4]+8 + ADD r11,r11,r9 ; r11= t[0]+t[7]+8 + ; TODO: This is wrong. + ; The C code truncates to 16 bits by storing to RAM and doing the + ; shifts later; we've got an extra 4 bits here. + MOV r4, r11,ASR #4 + MOV r5, r12,ASR #4 + MOV r6, r10,ASR #4 + MOV r7, r3, ASR #4 + RSB r3, r3, r9, LSL #1 ;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8 + RSB r10,r10,r9, LSL #1 ;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8 + RSB r12,r12,r9, LSL #1 ;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8 + RSB r11,r11,r9, LSL #1 ;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8 + MOV r3, r3, ASR #4 + MOV r10,r10,ASR #4 + MOV r12,r12,ASR #4 + MOV r11,r11,ASR #4 + STRH r4, [r0], #2 ; y[0] = t[0]+t[7] + STRH r5, [r0, #14] ; y[1] = t[0]+t[6] + STRH r6, [r0, #30] ; y[2] = t[0]+t[5] + STRH r7, [r0, #46] ; y[3] = t[0]+t[4] + STRH r3, [r0, #62] ; y[4] = t[0]-t[4] + STRH r10,[r0, #78] ; y[5] = t[0]-t[5] + STRH r12,[r0, #94] ; y[6] = t[0]-t[6] + STRH r11,[r0, #110] ; y[7] = t[0]-t[7] + MOV PC,r14 + ENDP + +idct3core_arm PROC + LDRSH r9, [r1], #16 ; r9 = x[0] + LDR r12,OC_C4S4 ; r12= OC_C4S4 + LDRSH r3, [r1, #-12] ; r3 = x[2] + LDR r10,OC_C6S2 ; r10= OC_C6S2 + MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] + LDR r4, OC_C2S6 ; r4 = OC_C2S6 + MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2] + LDRSH r11,[r1, #-14] ; r11= x[1] + MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2] + LDR r4, OC_C7S1 ; r4 = OC_C7S1 + LDR r5, OC_C1S7 ; r5 = OC_C1S7 + MOV r9, r9, ASR #16 ; r9 = t[0] + MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1] + ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3] + MUL r11,r5, r11 ; r11= t[7]<<16 = 
OC_C1S7*x[1] + MOV r4, r4, ASR #16 ; r4 = t[4] + MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4] + MOV r11,r11,ASR #16 ; r11= t[7] + MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] + ADD r10,r9, r10,ASR #16 ; r10= t[1] = t[0]+t[2] + RSB r6, r10,r9, LSL #1 ; r6 = t[2] = t[0]-t[2] + ; r3 = t2[0] = t[0]+t[3] + RSB r9, r3, r9, LSL #1 ; r9 = t2[3] = t[0]-t[3] + MOV r12,r12,ASR #16 ; r12= t[6] + ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5] + RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5] + ADD r11,r3, r11 ; r11= t2[0]+t[7] + ADD r5, r10,r5 ; r5 = t[1]+t2[6] + ADD r12,r6, r12 ; r12= t[2]+t2[5] + ADD r4, r9, r4 ; r4 = t2[3]+t[4] + STRH r11,[r0], #2 ; y[0] = t2[0]+t[7] + STRH r5, [r0, #14] ; y[1] = t[1]+t2[6] + STRH r12,[r0, #30] ; y[2] = t[2]+t2[5] + STRH r4, [r0, #46] ; y[3] = t2[3]+t[4] + RSB r11,r11,r3, LSL #1 ; r11= t2[0] - t[7] + RSB r5, r5, r10,LSL #1 ; r5 = t[1] - t2[6] + RSB r12,r12,r6, LSL #1 ; r12= t[2] - t2[5] + RSB r4, r4, r9, LSL #1 ; r4 = t2[3] - t[4] + STRH r4, [r0, #62] ; y[4] = t2[3]-t[4] + STRH r12,[r0, #78] ; y[5] = t[2]-t2[5] + STRH r5, [r0, #94] ; y[6] = t[1]-t2[6] + STRH r11,[r0, #110] ; y[7] = t2[0]-t[7] + MOV PC,R14 + ENDP + +idct3core_down_arm PROC + LDRSH r9, [r1], #16 ; r9 = x[0] + LDR r12,OC_C4S4 ; r12= OC_C4S4 + LDRSH r3, [r1, #-12] ; r3 = x[2] + LDR r10,OC_C6S2 ; r10= OC_C6S2 + MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] + LDR r4, OC_C2S6 ; r4 = OC_C2S6 + MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2] + LDRSH r11,[r1, #-14] ; r11= x[1] + MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2] + LDR r4, OC_C7S1 ; r4 = OC_C7S1 + LDR r5, OC_C1S7 ; r5 = OC_C1S7 + MOV r9, r9, ASR #16 ; r9 = t[0] + MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1] + ADD r9, r9, #8 ; r9 = t[0]+8 + MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1] + ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3]+8 + MOV r4, r4, ASR #16 ; r4 = t[4] + MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4] + MOV r11,r11,ASR #16 ; r11= t[7] + MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] + ADD r10,r9, r10,ASR #16 ; 
r10= t[1]+8 = t[0]+t[2]+8 + RSB r6, r10,r9, LSL #1 ; r6 = t[2]+8 = t[0]-t[2]+8 + ; r3 = t2[0]+8 = t[0]+t[3]+8 + RSB r9, r3, r9, LSL #1 ; r9 = t2[3]+8 = t[0]-t[3]+8 + MOV r12,r12,ASR #16 ; r12= t[6] + ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5] + RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5] + ADD r11,r3, r11 ; r11= t2[0]+t[7] +8 + ADD r5, r10,r5 ; r5 = t[1] +t2[6]+8 + ADD r12,r6, r12 ; r12= t[2] +t2[5]+8 + ADD r4, r9, r4 ; r4 = t2[3]+t[4] +8 + RSB r3, r11,r3, LSL #1 ; r3 = t2[0] - t[7] + 8 + RSB r10,r5, r10,LSL #1 ; r10= t[1] - t2[6] + 8 + RSB r6, r12,r6, LSL #1 ; r6 = t[2] - t2[5] + 8 + RSB r9, r4, r9, LSL #1 ; r9 = t2[3] - t[4] + 8 + ; TODO: This is wrong. + ; The C code truncates to 16 bits by storing to RAM and doing the + ; shifts later; we've got an extra 4 bits here. + MOV r11,r11,ASR #4 + MOV r5, r5, ASR #4 + MOV r12,r12,ASR #4 + MOV r4, r4, ASR #4 + MOV r9, r9, ASR #4 + MOV r6, r6, ASR #4 + MOV r10,r10,ASR #4 + MOV r3, r3, ASR #4 + STRH r11,[r0], #2 ; y[0] = t2[0]+t[7] + STRH r5, [r0, #14] ; y[1] = t[1]+t2[6] + STRH r12,[r0, #30] ; y[2] = t[2]+t2[5] + STRH r4, [r0, #46] ; y[3] = t2[3]+t[4] + STRH r9, [r0, #62] ; y[4] = t2[3]-t[4] + STRH r6, [r0, #78] ; y[5] = t[2]-t2[5] + STRH r10,[r0, #94] ; y[6] = t[1]-t2[6] + STRH r3, [r0, #110] ; y[7] = t2[0]-t[7] + MOV PC,R14 + ENDP + +idct4core_arm PROC + ; r0 = ogg_int16_t *_y (destination) + ; r1 = const ogg_int16_t *_x (source) + LDRSH r9, [r1], #16 ; r9 = x[0] + LDR r10,OC_C4S4 ; r10= OC_C4S4 + LDRSH r12,[r1, #-12] ; r12= x[2] + LDR r4, OC_C6S2 ; r4 = OC_C6S2 + MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] + LDR r5, OC_C2S6 ; r5 = OC_C2S6 + MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2] + LDRSH r3, [r1, #-14] ; r3 = x[1] + MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2] + LDR r6, OC_C7S1 ; r6 = OC_C7S1 + LDR r12,OC_C1S7 ; r12= OC_C1S7 + LDRSH r11,[r1, #-10] ; r11= x[3] + MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1] + LDR r7, OC_C5S3 ; r7 = OC_C5S3 + MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1] + LDR r8, 
OC_C3S5 ; r8 = OC_C3S5 + MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3] + MOV r9, r9, ASR #16 ; r9 = t[0] + MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3] + MOV r6, r6, ASR #16 ; r6 = t[4] +; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit +; before multiplying, not after (this is not equivalent) + SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5]) + RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5] + MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5]) + MOV r3, r3, ASR #16 ; r3 = t[7] + ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6] + RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6] + MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6]) + ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2] + RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2] + ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3] + RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3] + MOV r3, r3, ASR #16 ; r3 = t2[6] + ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5] + RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5] + ADD r11,r5, r11 ; r11= t[0]+t2[7] + ADD r6, r4, r6 ; r6 = t[1]+t3[6] + ADD r3, r10,r3 ; r3 = t[2]+t3[5] + ADD r7, r9, r7 ; r7 = t[3]+t2[4] + STRH r11,[r0], #2 ; y[0] = t[0]+t[7] + STRH r6, [r0, #14] ; y[1] = t[1]+t2[6] + STRH r3, [r0, #30] ; y[2] = t[2]+t2[5] + STRH r7, [r0, #46] ; y[3] = t2[3]+t[4] + RSB r11,r11,r5, LSL #1 ; r11= t[0]-t2[7] + RSB r6, r6, r4, LSL #1 ; r6 = t[1]-t3[6] + RSB r3, r3, r10,LSL #1 ; r3 = t[2]-t3[5] + RSB r7, r7, r9, LSL #1 ; r7 = t[3]-t2[4] + STRH r7, [r0, #62] ; y[4] = t2[3]-t[4] + STRH r3, [r0, #78] ; y[5] = t[2]-t2[5] + STRH r6, [r0, #94] ; y[6] = t[1]-t2[6] + STRH r11, [r0, #110] ; y[7] = t2[0]-t[7] + MOV PC,r14 + ENDP + +idct4core_down_arm PROC + ; r0 = ogg_int16_t *_y (destination) + ; r1 = const ogg_int16_t *_x (source) + LDRSH r9, [r1], #16 ; r9 = x[0] + LDR r10,OC_C4S4 ; r10= OC_C4S4 + LDRSH r12,[r1, #-12] ; r12= x[2] + LDR r4, OC_C6S2 ; r4 = OC_C6S2 + MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] + LDR r5, OC_C2S6 ; r5 = OC_C2S6 + 
MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2] + LDRSH r3, [r1, #-14] ; r3 = x[1] + MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2] + LDR r6, OC_C7S1 ; r6 = OC_C7S1 + LDR r12,OC_C1S7 ; r12= OC_C1S7 + LDRSH r11,[r1, #-10] ; r11= x[3] + MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1] + LDR r7, OC_C5S3 ; r7 = OC_C5S3 + MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1] + LDR r8, OC_C3S5 ; r8 = OC_C3S5 + MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3] + MOV r9, r9, ASR #16 ; r9 = t[0] + MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3] + MOV r6, r6, ASR #16 ; r6 = t[4] +; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit +; before multiplying, not after (this is not equivalent) + SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5]) + RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5] + MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5]) + MOV r3, r3, ASR #16 ; r3 = t[7] + ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6] + RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6] + ADD r9, r9, #8 ; r9 = t[0]+8 + MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6]) + ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2] + 8 + RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2] + 8 + ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3] + 8 + RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3] + 8 + MOV r3, r3, ASR #16 ; r3 = t2[6] + ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5] + RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5] + ADD r5, r5, r11 ; r5 = t[0]+t2[7]+8 + ADD r4, r4, r6 ; r4 = t[1]+t3[6]+8 + ADD r10,r10,r3 ; r10= t[2]+t3[5]+8 + ADD r9, r9, r7 ; r9 = t[3]+t2[4]+8 + SUB r11,r5, r11,LSL #1 ; r11= t[0]-t2[7]+8 + SUB r6, r4, r6, LSL #1 ; r6 = t[1]-t3[6]+8 + SUB r3, r10,r3, LSL #1 ; r3 = t[2]-t3[5]+8 + SUB r7, r9, r7, LSL #1 ; r7 = t[3]-t2[4]+8 + ; TODO: This is wrong. + ; The C code truncates to 16 bits by storing to RAM and doing the + ; shifts later; we've got an extra 4 bits here. 
+ MOV r11,r11,ASR #4 + MOV r6, r6, ASR #4 + MOV r3, r3, ASR #4 + MOV r7, r7, ASR #4 + MOV r9, r9, ASR #4 + MOV r10,r10,ASR #4 + MOV r4, r4, ASR #4 + MOV r5, r5, ASR #4 + STRH r5,[r0], #2 ; y[0] = t[0]+t[7] + STRH r4, [r0, #14] ; y[1] = t[1]+t2[6] + STRH r10,[r0, #30] ; y[2] = t[2]+t2[5] + STRH r9, [r0, #46] ; y[3] = t2[3]+t[4] + STRH r7, [r0, #62] ; y[4] = t2[3]-t[4] + STRH r3, [r0, #78] ; y[5] = t[2]-t2[5] + STRH r6, [r0, #94] ; y[6] = t[1]-t2[6] + STRH r11,[r0, #110] ; y[7] = t2[0]-t[7] + MOV PC,r14 + ENDP + +idct8core_arm PROC + ; r0 = ogg_int16_t *_y (destination) + ; r1 = const ogg_int16_t *_x (source) + LDRSH r2, [r1],#16 ; r2 = x[0] + STMFD r13!,{r1,r14} + LDRSH r6, [r1, #-8] ; r6 = x[4] + LDR r12,OC_C4S4 ; r12= C4S4 + LDRSH r4, [r1, #-12] ; r4 = x[2] + ADD r2, r2, r6 ; r2 = x[0] + x[4] + SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4] + ; For spec compliance, these sums must be truncated to 16-bit precision + ; _before_ the multiply (not after). + ; Sadly, ARMv4 provides no simple way to do that. 
+ MOV r2, r2, LSL #16 + MOV r6, r6, LSL #16 + MOV r2, r2, ASR #16 + MOV r6, r6, ASR #16 + MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4]) + LDRSH r8, [r1, #-4] ; r8 = x[6] + LDR r7, OC_C6S2 ; r7 = OC_C6S2 + MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4]) + LDR r14,OC_C2S6 ; r14= OC_C2S6 + MUL r3, r4, r7 ; r3 = OC_C6S2*x[2] + LDR r5, OC_C7S1 ; r5 = OC_C7S1 + MUL r4, r14,r4 ; r4 = OC_C2S6*x[2] + MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16 + MUL r14,r8, r14 ; r14= OC_C2S6*x[6] + MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16 + MUL r8, r7, r8 ; r8 = OC_C6S2*x[6] + LDR r7, OC_C1S7 ; r7 = OC_C1S7 + SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16 + LDRSH r14,[r1, #-14] ; r14= x[1] + ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16 + LDRSH r8, [r1, #-2] ; r8 = x[7] + MUL r9, r5, r14 ; r9 = OC_C7S1*x[1] + LDRSH r10,[r1, #-6] ; r10= x[5] + MUL r14,r7, r14 ; r14= OC_C1S7*x[1] + MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16 + MUL r7, r8, r7 ; r7 = OC_C1S7*x[7] + MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16 + MUL r8, r5, r8 ; r8 = OC_C7S1*x[7] + LDRSH r1, [r1, #-10] ; r1 = x[3] + LDR r5, OC_C3S5 ; r5 = OC_C3S5 + LDR r11,OC_C5S3 ; r11= OC_C5S3 + ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16 + MUL r14,r5, r10 ; r14= OC_C3S5*x[5] + SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16 + MUL r10,r11,r10 ; r10= OC_C5S3*x[5] + MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16 + MUL r11,r1, r11 ; r11= OC_C5S3*x[3] + MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16 + MUL r1, r5, r1 ; r1 = OC_C3S5*x[3] + SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16 + ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16 + ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4] + ; r10=t[6] r12=C4S4 r14=t[5] +; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit +; before multiplying, not after (this is not equivalent) + ; Stage 2 + ; 4-5 butterfly + ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5] + SUB r14,r9, r14, LSL #1 
; r14= t[4]-t[5] + MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5]) + ; 7-6 butterfly + ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6] + SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6] + MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6]) + ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4] + ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16 + ; Stage 3 + ; 0-3 butterfly + ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3] + SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3] + ; 1-2 butterfly + ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2] + SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2] + ; 6-5 butterfly + MOV r14,r14,ASR #16 ; r14= t2[5] + ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5] + SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5] + ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4] + ; r10=t3[6] r14=t3[5] + ; Stage 4 + ADD r2, r2, r8 ; r2 = t[0] + t[7] + ADD r6, r6, r10 ; r6 = t[1] + t[6] + ADD r3, r3, r14 ; r3 = t[2] + t[5] + ADD r4, r4, r9 ; r4 = t[3] + t[4] + SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7] + SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6] + SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5] + SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4] + STRH r2, [r0], #2 ; y[0] = t[0]+t[7] + STRH r6, [r0, #14] ; y[1] = t[1]+t[6] + STRH r3, [r0, #30] ; y[2] = t[2]+t[5] + STRH r4, [r0, #46] ; y[3] = t[3]+t[4] + STRH r9, [r0, #62] ; y[4] = t[3]-t[4] + STRH r14,[r0, #78] ; y[5] = t[2]-t[5] + STRH r10,[r0, #94] ; y[6] = t[1]-t[6] + STRH r8, [r0, #110] ; y[7] = t[0]-t[7] + LDMFD r13!,{r1,PC} + ENDP + +idct8core_down_arm PROC + ; r0 = ogg_int16_t *_y (destination) + ; r1 = const ogg_int16_t *_x (source) + LDRSH r2, [r1],#16 ; r2 = x[0] + STMFD r13!,{r1,r14} + LDRSH r6, [r1, #-8] ; r6 = x[4] + LDR r12,OC_C4S4 ; r12= C4S4 + LDRSH r4, [r1, #-12] ; r4 = x[2] + ADD r2, r2, r6 ; r2 = x[0] + x[4] + SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4] + ; For spec compliance, these sums must be truncated to 16-bit precision + ; _before_ the multiply (not after). 
+ ; Sadly, ARMv4 provides no simple way to do that. + MOV r2, r2, LSL #16 + MOV r6, r6, LSL #16 + MOV r2, r2, ASR #16 + MOV r6, r6, ASR #16 + MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4]) + LDRSH r8, [r1, #-4] ; r8 = x[6] + LDR r7, OC_C6S2 ; r7 = OC_C6S2 + MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4]) + LDR r14,OC_C2S6 ; r14= OC_C2S6 + MUL r3, r4, r7 ; r3 = OC_C6S2*x[2] + LDR r5, OC_C7S1 ; r5 = OC_C7S1 + MUL r4, r14,r4 ; r4 = OC_C2S6*x[2] + MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16 + MUL r14,r8, r14 ; r14= OC_C2S6*x[6] + MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16 + MUL r8, r7, r8 ; r8 = OC_C6S2*x[6] + LDR r7, OC_C1S7 ; r7 = OC_C1S7 + SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16 + LDRSH r14,[r1, #-14] ; r14= x[1] + ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16 + LDRSH r8, [r1, #-2] ; r8 = x[7] + MUL r9, r5, r14 ; r9 = OC_C7S1*x[1] + LDRSH r10,[r1, #-6] ; r10= x[5] + MUL r14,r7, r14 ; r14= OC_C1S7*x[1] + MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16 + MUL r7, r8, r7 ; r7 = OC_C1S7*x[7] + MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16 + MUL r8, r5, r8 ; r8 = OC_C7S1*x[7] + LDRSH r1, [r1, #-10] ; r1 = x[3] + LDR r5, OC_C3S5 ; r5 = OC_C3S5 + LDR r11,OC_C5S3 ; r11= OC_C5S3 + ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16 + MUL r14,r5, r10 ; r14= OC_C3S5*x[5] + SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16 + MUL r10,r11,r10 ; r10= OC_C5S3*x[5] + MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16 + MUL r11,r1, r11 ; r11= OC_C5S3*x[3] + MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16 + MUL r1, r5, r1 ; r1 = OC_C3S5*x[3] + SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16 + ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16 + ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4] + ; r10=t[6] r12=C4S4 r14=t[5] + ; Stage 2 +; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit +; before multiplying, not after (this is not equivalent) + ; 4-5 butterfly + ADD r9, r9, r14 
; r9 = t2[4] = t[4]+t[5] + SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5] + MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5]) + ; 7-6 butterfly + ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6] + SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6] + MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6]) + ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4] + ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16 + ; Stage 3 + ADD r2, r2, #8<<16 ; r2 = t[0]+8<<16 + ADD r6, r6, #8<<16 ; r6 = t[1]+8<<16 + ; 0-3 butterfly + ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3] + 8 + SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3] + 8 + ; 1-2 butterfly + ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2] + 8 + SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2] + 8 + ; 6-5 butterfly + MOV r14,r14,ASR #16 ; r14= t2[5] + ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5] + SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5] + ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4] + ; r10=t3[6] r14=t3[5] + ; Stage 4 + ADD r2, r2, r8 ; r2 = t[0] + t[7] + 8 + ADD r6, r6, r10 ; r6 = t[1] + t[6] + 8 + ADD r3, r3, r14 ; r3 = t[2] + t[5] + 8 + ADD r4, r4, r9 ; r4 = t[3] + t[4] + 8 + SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7] + 8 + SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6] + 8 + SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5] + 8 + SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4] + 8 + ; TODO: This is wrong. + ; The C code truncates to 16 bits by storing to RAM and doing the + ; shifts later; we've got an extra 4 bits here. 
+ MOV r2, r2, ASR #4 + MOV r6, r6, ASR #4 + MOV r3, r3, ASR #4 + MOV r4, r4, ASR #4 + MOV r8, r8, ASR #4 + MOV r10,r10,ASR #4 + MOV r14,r14,ASR #4 + MOV r9, r9, ASR #4 + STRH r2, [r0], #2 ; y[0] = t[0]+t[7] + STRH r6, [r0, #14] ; y[1] = t[1]+t[6] + STRH r3, [r0, #30] ; y[2] = t[2]+t[5] + STRH r4, [r0, #46] ; y[3] = t[3]+t[4] + STRH r9, [r0, #62] ; y[4] = t[3]-t[4] + STRH r14,[r0, #78] ; y[5] = t[2]-t[5] + STRH r10,[r0, #94] ; y[6] = t[1]-t[6] + STRH r8, [r0, #110] ; y[7] = t[0]-t[7] + LDMFD r13!,{r1,PC} + ENDP + + [ OC_ARM_ASM_MEDIA + EXPORT oc_idct8x8_1_v6 + EXPORT oc_idct8x8_v6 + +oc_idct8x8_1_v6 PROC + ; r0 = ogg_int16_t *_y + ; r1 = ogg_uint16_t _dc + ORR r2, r1, r1, LSL #16 + ORR r3, r1, r1, LSL #16 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + STRD r2, [r0], #8 + MOV PC, r14 + ENDP + +oc_idct8x8_v6 PROC + ; r0 = ogg_int16_t *_y + ; r1 = ogg_int16_t *_x + ; r2 = int _last_zzi + CMP r2, #3 + BLE oc_idct8x8_3_v6 + ;CMP r2, #6 + ;BLE oc_idct8x8_6_v6 + CMP r2, #10 + BLE oc_idct8x8_10_v6 +oc_idct8x8_slow_v6 + STMFD r13!,{r4-r11,r14} + SUB r13,r13,#64*2 +; Row transforms + STR r0, [r13,#-4]! + ADD r0, r13, #4 ; Write to temp storage. + BL idct8_8core_v6 + BL idct8_8core_v6 + BL idct8_8core_v6 + BL idct8_8core_v6 + LDR r0, [r13], #4 ; Write to the final destination. + ; Clear input data for next block (decoder only). + SUB r2, r1, #8*16 + CMP r0, r2 + MOV r1, r13 ; And read from temp storage. 
+ BEQ oc_idct8x8_slow_v6_cols + MOV r4, #0 + MOV r5, #0 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 + STRD r4, [r2], #8 +oc_idct8x8_slow_v6_cols +; Column transforms + BL idct8_8core_down_v6 + BL idct8_8core_down_v6 + BL idct8_8core_down_v6 + BL idct8_8core_down_v6 + ADD r13,r13,#64*2 + LDMFD r13!,{r4-r11,PC} + ENDP + +oc_idct8x8_10_v6 PROC + STMFD r13!,{r4-r11,r14} + SUB r13,r13,#64*2+4 +; Row transforms + MOV r2, r13 + STR r0, [r13,#-4]! + AND r0, r2, #4 ; Align the stack. + ADD r0, r0, r2 ; Write to temp storage. + BL idct4_3core_v6 + BL idct2_1core_v6 + LDR r0, [r13], #4 ; Write to the final destination. + ; Clear input data for next block (decoder only). + SUB r2, r1, #4*16 + CMP r0, r2 + AND r1, r13,#4 ; Align the stack. + BEQ oc_idct8x8_10_v6_cols + MOV r4, #0 + MOV r5, #0 + STRD r4, [r2] + STRD r4, [r2,#16] + STR r4, [r2,#32] + STR r4, [r2,#48] +oc_idct8x8_10_v6_cols +; Column transforms + ADD r1, r1, r13 ; And read from temp storage. + BL idct4_4core_down_v6 + BL idct4_4core_down_v6 + BL idct4_4core_down_v6 + BL idct4_4core_down_v6 + ADD r13,r13,#64*2+4 + LDMFD r13!,{r4-r11,PC} + ENDP + +oc_idct8x8_3_v6 PROC + STMFD r13!,{r4-r8,r14} + SUB r13,r13,#64*2 +; Row transforms + MOV r8, r0 + MOV r0, r13 ; Write to temp storage. + BL idct2_1core_v6 + ; Clear input data for next block (decoder only). + SUB r0, r1, #2*16 + CMP r0, r8 + MOV r1, r13 ; Read from temp storage. + MOVNE r4, #0 + STRNE r4, [r0] + STRNE r4, [r0,#16] + MOVNE r0, r8 ; Write to the final destination. 
+; Column transforms + BL idct2_2core_down_v6 + BL idct2_2core_down_v6 + BL idct2_2core_down_v6 + BL idct2_2core_down_v6 + ADD r13,r13,#64*2 + LDMFD r13!,{r4-r8,PC} + ENDP + +idct2_1core_v6 PROC + ; r0 = ogg_int16_t *_y (destination) + ; r1 = const ogg_int16_t *_x (source) +; Stage 1: + LDR r2, [r1], #16 ; r2 = <x[0,1]|x[0,0]> + LDR r3, OC_C4S4 + LDRSH r6, [r1], #16 ; r6 = x[1,0] + SMULWB r12,r3, r2 ; r12= t[0,0]=OC_C4S4*x[0,0]>>16 + LDRD r4, OC_C7S1 ; r4 = OC_C7S1; r5 = OC_C1S7 + SMULWB r6, r3, r6 ; r6 = t[1,0]=OC_C4S4*x[1,0]>>16 + SMULWT r4, r4, r2 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16 + SMULWT r7, r5, r2 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 +; Stage 2: + SMULWB r5, r3, r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16 + PKHBT r12,r12,r6, LSL #16 ; r12= <t[1,0]|t[0,0]> + SMULWB r6, r3, r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16 + PKHBT r7, r7, r3 ; r7 = <0|t[0,7]> +; Stage 3: + PKHBT r5, r6, r5, LSL #16 ; r5 = <t[0,5]|t[0,6]> + PKHBT r4, r4, r3 ; r4 = <0|t[0,4]> + SASX r5, r5, r5 ; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]> +; Stage 4: + PKHTB r6, r3, r5, ASR #16 ; r6 = <0|t[0,6]> + PKHBT r5, r5, r3 ; r5 = <0|t[0,5]> + SADD16 r3, r12,r7 ; r3 = t[0]+t[7] + STR r3, [r0], #4 ; y[0<<3] = t[0]+t[7] + SADD16 r3, r12,r6 ; r3 = t[0]+t[6] + STR r3, [r0, #12] ; y[1<<3] = t[0]+t[6] + SADD16 r3, r12,r5 ; r3 = t[0]+t[5] + STR r3, [r0, #28] ; y[2<<3] = t[0]+t[5] + SADD16 r3, r12,r4 ; r3 = t[0]+t[4] + STR r3, [r0, #44] ; y[3<<3] = t[0]+t[4] + SSUB16 r4, r12,r4 ; r4 = t[0]-t[4] + STR r4, [r0, #60] ; y[4<<3] = t[0]-t[4] + SSUB16 r5, r12,r5 ; r5 = t[0]-t[5] + STR r5, [r0, #76] ; y[5<<3] = t[0]-t[5] + SSUB16 r6, r12,r6 ; r6 = t[0]-t[6] + STR r6, [r0, #92] ; y[6<<3] = t[0]-t[6] + SSUB16 r7, r12,r7 ; r7 = t[0]-t[7] + STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7] + MOV PC,r14 + ENDP + ] + + ALIGN 8 +OC_C7S1 + DCD 12785 ; 31F1 +OC_C1S7 + DCD 64277 ; FB15 +OC_C6S2 + DCD 25080 ; 61F8 +OC_C2S6 + DCD 60547 ; EC83 +OC_C5S3 + DCD 36410 ; 8E3A +OC_C3S5 + DCD 54491 ; D4DB +OC_C4S4 + DCD 46341 ; B505 + + [ OC_ARM_ASM_MEDIA 
+idct2_2core_down_v6 PROC
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; Angle brackets in the comments below show packed halfwords as <top|bottom>.
+ ; The rounding bias of 8 is folded into t[0] via SMLAWB, so Stage 4 only
+ ; needs to shift each result right by 4.
+; Stage 1:
+ LDR r2, [r1], #16 ; r2 = <x[0,1]|x[0,0]>
+ LDR r3, OC_C4S4
+ MOV r7 ,#8 ; r7 = 8
+ LDR r6, [r1], #16 ; r6 = <x[1,1]|x[1,0]>
+ SMLAWB r12,r3, r2, r7 ; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
+ LDRD r4, OC_C7S1 ; r4 = OC_C7S1; r5 = OC_C1S7
+ SMLAWB r7, r3, r6, r7 ; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
+ SMULWT r5, r5, r2 ; r5 = t[0,7]=OC_C1S7*x[0,1]>>16
+ PKHBT r12,r12,r7, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
+ SMULWT r4, r4, r2 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
+ PKHBT r7, r5, r5, LSL #16 ; r7 = <t[0,7]|t[0,7]>
+; Stage 2:
+ SMULWB r6, r3, r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+ PKHBT r4, r4, r4, LSL #16 ; r4 = <t[0,4]|t[0,4]>
+ SMULWT r2, r3, r7 ; r2 = t[1,6]=OC_C4S4*t[1,7]>>16
+ SMULWB r5, r3, r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+ PKHBT r6, r6, r2, LSL #16 ; r6 = <t[1,6]|t[0,6]>
+ SMULWT r2, r3, r4 ; r2 = t[1,5]=OC_C4S4*t[1,4]>>16
+ PKHBT r2, r5, r2, LSL #16 ; r2 = <t[1,5]|t[0,5]>
+; Stage 3:
+ SSUB16 r5, r6, r2 ; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
+ SADD16 r6, r6, r2 ; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
+; Stage 4:
+ ; Each result below is shifted right by 4 per halfword: the ASR #4 copy
+ ; supplies the top half, and the LSL #16 / ASR #20 pair supplies the
+ ; sign-extended bottom half, which PKHTB then recombines.
+ SADD16 r2, r12,r7 ; r2 = t[0]+t[7]+8
+ MOV r3, r2, ASR #4
+ MOV r2, r2, LSL #16
+ PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[7]+8>>4
+ STR r3, [r0], #4 ; y[0<<3] = t[0]+t[7]+8>>4
+ SADD16 r2, r12,r6 ; r2 = t[0]+t[6]+8
+ MOV r3, r2, ASR #4
+ MOV r2, r2, LSL #16
+ PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[6]+8>>4
+ STR r3, [r0, #12] ; y[1<<3] = t[0]+t[6]+8>>4
+ SADD16 r2, r12,r5 ; r2 = t[0]+t[5]+8
+ MOV r3, r2, ASR #4
+ MOV r2, r2, LSL #16
+ PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[5]+8>>4
+ STR r3, [r0, #28] ; y[2<<3] = t[0]+t[5]+8>>4
+ SADD16 r2, r12,r4 ; r2 = t[0]+t[4]+8
+ MOV r3, r2, ASR #4
+ MOV r2, r2, LSL #16
+ PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[4]+8>>4
+ STR r3, [r0, #44] ; y[3<<3] = t[0]+t[4]+8>>4
+ SSUB16 r4, r12,r4 ; r4 = t[0]-t[4]+8
+ MOV r3, r4, ASR #4
+ MOV r4, r4, LSL #16
+ PKHTB r3, r3, r4, ASR #20 ; r3 = t[0]-t[4]+8>>4
+ STR r3, [r0, #60] ; y[4<<3] = t[0]-t[4]+8>>4
+ SSUB16 r5, r12,r5 ; r5 = t[0]-t[5]+8
+ MOV r3, r5, ASR #4
+ MOV r5, r5, LSL #16
+ PKHTB r3, r3, r5, ASR #20 ; r3 = t[0]-t[5]+8>>4
+ STR r3, [r0, #76] ; y[5<<3] = t[0]-t[5]+8>>4
+ SSUB16 r6, r12,r6 ; r6 = t[0]-t[6]+8
+ MOV r3, r6, ASR #4
+ MOV r6, r6, LSL #16
+ PKHTB r3, r3, r6, ASR #20 ; r3 = t[0]-t[6]+8>>4
+ STR r3, [r0, #92] ; y[6<<3] = t[0]-t[6]+8>>4
+ SSUB16 r7, r12,r7 ; r7 = t[0]-t[7]+8
+ MOV r3, r7, ASR #4
+ MOV r7, r7, LSL #16
+ PKHTB r3, r3, r7, ASR #20 ; r3 = t[0]-t[7]+8>>4
+ STR r3, [r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4
+ MOV PC,r14
+ ENDP
+
+; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
+; pay for increased branch mis-prediction to get here, but in practice it
+; doesn't seem to slow anything down to take it out, and it's less code this
+; way.
+ [ 0
+oc_idct8x8_6_v6 PROC
+ STMFD r13!,{r4-r8,r10,r11,r14}
+ SUB r13,r13,#64*2+4
+; Row transforms
+ MOV r8, r0
+ AND r0, r13,#4 ; Align the stack.
+ ADD r0, r0, r13 ; Write to temp storage.
+ BL idct3_2core_v6
+ BL idct1core_v6
+ ; Clear input data for next block (decoder only).
+ SUB r0, r1, #3*16
+ CMP r0, r8
+ AND r1, r13,#4 ; Align the stack.
+ BEQ oc_idct8x8_6_v6_cols
+ MOV r4, #0
+ MOV r5, #0
+ STRD r4, [r0]
+ STR r4, [r0,#16]
+ STR r4, [r0,#32]
+ MOV r0, r8 ; Write to the final destination.
+oc_idct8x8_6_v6_cols
+; Column transforms
+ ADD r1, r1, r13 ; And read from temp storage.
+ BL idct3_3core_down_v6
+ BL idct3_3core_down_v6
+ BL idct3_3core_down_v6
+ BL idct3_3core_down_v6
+ ADD r13,r13,#64*2+4
+ LDMFD r13!,{r4-r8,r10,r11,PC}
+ ENDP
+
+idct1core_v6 PROC
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ LDRSH r3, [r1], #16
+ MOV r12,#0x05
+ ORR r12,r12,#0xB500 ; r12= 0xB505, the same value as OC_C4S4
+ MUL r3, r12, r3
+ ; Stall ?
+ MOV r3, r3, ASR #16
+ ; Don't need to actually store the odd lines; they won't be read.
+ STRH r3, [r0], #2
+ STRH r3, [r0, #30]
+ STRH r3, [r0, #62]
+ STRH r3, [r0, #94]
+ MOV PC,R14
+ ENDP
+
+idct3_2core_v6 PROC
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+ LDRD r4, [r1], #16 ; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
+ LDRD r10,OC_C6S2_3_v6 ; r10= OC_C6S2; r11= OC_C2S6
+ ; Stall
+ SMULWB r3, r11,r5 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+ LDR r11,OC_C4S4
+ SMULWB r2, r10,r5 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+ LDR r5, [r1], #16 ; r5 = <x[1,1]|x[1,0]>
+ SMULWB r12,r11,r4 ; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
+ LDRD r6, OC_C7S1_3_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
+ SMULWB r10,r11,r5 ; r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
+ PKHBT r12,r12,r10,LSL #16 ; r12= <t[1,0]|t[0,0]>
+ SMULWT r10,r7, r5 ; r10= t[1,7]=OC_C1S7*x[1,1]>>16
+ PKHBT r2, r2, r11 ; r2 = <0|t[0,2]>
+ SMULWT r7, r7, r4 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+ PKHBT r3, r3, r11 ; r3 = <0|t[0,3]>
+ SMULWT r5, r6, r5 ; r5 = t[1,4]=OC_C7S1*x[1,1]>>16
+ PKHBT r7, r7, r10,LSL #16 ; r7 = <t[1,7]|t[0,7]>
+ SMULWT r4, r6, r4 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+; Stage 2:
+ SMULWB r6, r11,r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+ PKHBT r4, r4, r5, LSL #16 ; r4 = <t[1,4]|t[0,4]>
+ SMULWT r10,r11,r7 ; r10= t[1,6]=OC_C4S4*t[1,7]>>16
+ SMULWB r5, r11,r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+ PKHBT r6, r6, r10,LSL #16 ; r6 = <t[1,6]|t[0,6]>
+ SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16
+; Stage 3:
+ B idct4_3core_stage3_v6
+ ENDP
+
+; Another copy so the LDRD offsets are less than +/- 255.
+ ALIGN 8
+OC_C7S1_3_v6
+ DCD 12785 ; 31F1
+OC_C1S7_3_v6
+ DCD 64277 ; FB15
+OC_C6S2_3_v6
+ DCD 25080 ; 61F8
+OC_C2S6_3_v6
+ DCD 60547 ; EC83
+
+idct3_3core_down_v6 PROC
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; Angle brackets in the comments below show packed halfwords as <top|bottom>.
+; Stage 1:
+ LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
+ LDRD r6, OC_C6S2_3_v6 ; r6 = OC_C6S2; r7 = OC_C2S6
+ LDR r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>
+ SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+ MOV r7,#8
+ SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+ LDR r11,OC_C4S4
+ SMLAWB r12,r11,r10,r7 ; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
+ PKHBT r3, r3, r3, LSL #16 ; r3 = <t[0,3]|t[0,3]>
+ SMLAWB r5, r11,r4, r7 ; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+ PKHBT r2, r2, r2, LSL #16 ; r2 = <t[0,2]|t[0,2]>
+ LDRD r6, OC_C7S1_3_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
+ PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
+ SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+ SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+ SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+ PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]>
+ SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+ SMULWB r6, r11,r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+ PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]>
+ SMULWT r10,r11,r7 ; r10= t[1,6]=OC_C4S4*t[1,7]>>16
+ SMULWB r5, r11,r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+ PKHBT r6, r6, r10,LSL #16 ; r6 = <t[1,6]|t[0,6]>
+ SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16
+; Stage 3:
+ B idct4_4core_down_stage3_v6
+ ENDP
+ ]
+
+idct4_3core_v6 PROC
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; Angle brackets in the comments below show packed halfwords as <top|bottom>.
+; Stage 1:
+ LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+ LDRD r2, OC_C5S3_4_v6 ; r2 = OC_C5S3; r3 = OC_C3S5
+ LDRD r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
+ SMULWT r9, r3, r11 ; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+ SMULWT r8, r2, r11 ; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+ PKHBT r9, r9, r2 ; r9 = <0|t[0,6]>
+ LDRD r6, OC_C6S2_4_v6 ; r6 = OC_C6S2; r7 = OC_C2S6
+ PKHBT r8, r8, r2 ; r8 = <0|-t[0,5]>
+ SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+ SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+ LDR r11,OC_C4S4
+ SMULWB r12,r7, r5 ; r12= t[1,3]=OC_C2S6*x[1,2]>>16
+ SMULWB r5, r6, r5 ; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+ PKHBT r3, r3, r12,LSL #16 ; r3 = <t[1,3]|t[0,3]>
+ SMULWB r12,r11,r10 ; r12= t[0,0]=OC_C4S4*x[0,0]>>16
+ PKHBT r2, r2, r5, LSL #16 ; r2 = <t[1,2]|t[0,2]>
+ SMULWB r5, r11,r4 ; r5 = t[1,0]=OC_C4S4*x[1,0]>>16
+ LDRD r6, OC_C7S1_4_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
+ PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]|t[0,0]>
+ SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+ SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+ SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+ PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]>
+ SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+ SSUB16 r6, r7, r9 ; r6 = t[7]-t[6]
+ PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]>
+ SADD16 r7, r7, r9 ; r7 = t[7]=t[7]+t[6]
+ SMULWT r9, r11,r6 ; r9 = t[1,6]=OC_C4S4*r6T>>16
+ SADD16 r5, r4, r8 ; r5 = t[4]-t[5]
+ SMULWB r6, r11,r6 ; r6 = t[0,6]=OC_C4S4*r6B>>16
+ SSUB16 r4, r4, r8 ; r4 = t[4]=t[4]+t[5]
+ SMULWT r10,r11,r5 ; r10= t[1,5]=OC_C4S4*r5T>>16
+ PKHBT r6, r6, r9, LSL #16 ; r6 = <t[1,6]|t[0,6]>
+ SMULWB r5, r11,r5 ; r5 = t[0,5]=OC_C4S4*r5B>>16
+; Stage 3:
+idct4_3core_stage3_v6
+ SADD16 r11,r12,r2 ; r11= t[1]=t[0]+t[2]
+ PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]>
+ SSUB16 r2, r12,r2 ; r2 = t[2]=t[0]-t[2]
+idct4_3core_stage3_5_v6
+ SSUB16 r5, r6, r10 ; r5 = t[5]'=t[6]-t[5]
+ SADD16 r6, r6, r10 ; r6 = t[6]=t[6]+t[5]
+ SADD16 r10,r12,r3 ; r10= t[0]'=t[0]+t[3]
+ SSUB16 r3, r12,r3 ; r3 = t[3]=t[0]-t[3]
+; Stage 4:
+ SADD16 r12,r10,r7 ; r12= t[0]+t[7]
+ STR r12,[r0], #4 ; y[0<<3] = t[0]+t[7]
+ SADD16 r12,r11,r6 ; r12= t[1]+t[6]
+ STR r12,[r0, #12] ; y[1<<3] = t[1]+t[6]
+ SADD16 r12,r2, r5 ; r12= t[2]+t[5]
+ STR r12,[r0, #28] ; y[2<<3] = t[2]+t[5]
+ SADD16 r12,r3, r4 ; r12= t[3]+t[4]
+ STR r12,[r0, #44] ; y[3<<3] = t[3]+t[4]
+ SSUB16 r4, r3, r4 ; r4 = t[3]-t[4]
+ STR r4, [r0, #60] ; y[4<<3] = t[3]-t[4]
+ SSUB16 r5, r2, r5 ; r5 = t[2]-t[5]
+ STR r5, [r0, #76] ; y[5<<3] = t[2]-t[5]
+ SSUB16 r6, r11,r6 ; r6 = t[1]-t[6]
+ STR r6, [r0, #92] ; y[6<<3] = t[1]-t[6]
+ SSUB16 r7, r10,r7 ; r7 = t[0]-t[7]
+ STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7]
+ MOV PC,r14
+ ENDP
+
+; Another copy so the LDRD offsets are less than +/- 255.
+ ALIGN 8
+OC_C7S1_4_v6
+ DCD 12785 ; 31F1
+OC_C1S7_4_v6
+ DCD 64277 ; FB15
+OC_C6S2_4_v6
+ DCD 25080 ; 61F8
+OC_C2S6_4_v6
+ DCD 60547 ; EC83
+OC_C5S3_4_v6
+ DCD 36410 ; 8E3A
+OC_C3S5_4_v6
+ DCD 54491 ; D4DB
+
+idct4_4core_down_v6 PROC
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; The rounding bias of 8 is folded into t[0] via SMLAWB before the
+ ; butterflies; the code then branches into idct8_8core_down_stage3_5_v6.
+; Stage 1:
+ LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+ LDRD r2, OC_C5S3_4_v6 ; r2 = OC_C5S3; r3 = OC_C3S5
+ LDRD r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
+ SMULWT r9, r3, r11 ; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+ LDRD r6, OC_C6S2_4_v6 ; r6 = OC_C6S2; r7 = OC_C2S6
+ SMULWT r8, r2, r11 ; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
+ PKHBT r9, r9, r9, LSL #16 ; r9 = <t[0,6]|t[0,6]>
+ SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+ PKHBT r8, r8, r8, LSL #16 ; r8 = <-t[0,5]|-t[0,5]>
+ SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+ LDR r11,OC_C4S4
+ SMULWB r12,r7, r5 ; r12= t[1,3]=OC_C2S6*x[1,2]>>16
+ MOV r7,#8
+ SMULWB r5, r6, r5 ; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+ PKHBT r3, r3, r12,LSL #16 ; r3 = <t[1,3]|t[0,3]>
+ SMLAWB r12,r11,r10,r7 ; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+ PKHBT r2, r2, r5, LSL #16 ; r2 = <t[1,2]|t[0,2]>
+ SMLAWB r5, r11,r4 ,r7 ; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+ LDRD r6, OC_C7S1_4_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
+ PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
+ SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+ SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+ SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+ PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]>
+ SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+ SSUB16 r6, r7, r9 ; r6 = t[7]-t[6]
+ PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]>
+ SADD16 r7, r7, r9 ; r7 = t[7]=t[7]+t[6]
+ SMULWT r9, r11,r6 ; r9 = t[1,6]=OC_C4S4*r6T>>16
+ SADD16 r5, r4, r8 ; r5 = t[4]-t[5]
+ SMULWB r6, r11,r6 ; r6 = t[0,6]=OC_C4S4*r6B>>16
+ SSUB16 r4, r4, r8 ; r4 = t[4]=t[4]+t[5]
+ SMULWT r10,r11,r5 ; r10= t[1,5]=OC_C4S4*r5T>>16
+ PKHBT r6, r6, r9, LSL #16 ; r6 = <t[1,6]|t[0,6]>
+ SMULWB r5, r11,r5 ; r5 = t[0,5]=OC_C4S4*r5B>>16
+; Stage 3:
+idct4_4core_down_stage3_v6
+ SADD16 r11,r12,r2 ; r11= t[1]+8=t[0]+t[2]+8
+ PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]>
+ SSUB16 r2, r12,r2 ; r2 = t[2]+8=t[0]-t[2]+8
+ B idct8_8core_down_stage3_5_v6
+ ENDP
+
+idct8_8core_v6 PROC
+ ; r0 and r14 are both used as scratch registers below, so they are saved
+ ; here and restored just before branching to idct4_3core_stage3_5_v6.
+ STMFD r13!,{r0,r14}
+; Stage 1:
+ ;5-6 rotation by 3pi/16
+ LDRD r10,OC_C5S3_4_v6 ; r10= OC_C5S3, r11= OC_C3S5
+ LDR r4, [r1,#8] ; r4 = <x[0,5]|x[0,4]>
+ LDR r7, [r1,#24] ; r7 = <x[1,5]|x[1,4]>
+ SMULWT r5, r11,r4 ; r5 = OC_C3S5*x[0,5]>>16
+ LDR r0, [r1,#4] ; r0 = <x[0,3]|x[0,2]>
+ SMULWT r3, r11,r7 ; r3 = OC_C3S5*x[1,5]>>16
+ LDR r12,[r1,#20] ; r12= <x[1,3]|x[1,2]>
+ SMULWT r6, r11,r0 ; r6 = OC_C3S5*x[0,3]>>16
+ SMULWT r11,r11,r12 ; r11= OC_C3S5*x[1,3]>>16
+ SMLAWT r6, r10,r4, r6 ; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+ PKHBT r5, r5, r3, LSL #16 ; r5 = <r3|r5>
+ SMLAWT r11,r10,r7, r11 ; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+ PKHBT r4, r4, r7, LSL #16 ; r4 = <x[1,4]|x[0,4]>
+ SMULWT r3, r10,r0 ; r3 = OC_C5S3*x[0,3]>>16
+ PKHBT r6, r6, r11,LSL #16 ; r6 = <t[1,6]|t[0,6]>
+ SMULWT r8, r10,r12 ; r8 = OC_C5S3*x[1,3]>>16
+ ;2-3 rotation by 6pi/16
+ LDRD r10,OC_C6S2_4_v6 ; r10= OC_C6S2, r11= OC_C2S6
+ PKHBT r3, r3, r8, LSL #16 ; r3 = <r8|r3>
+ LDR r8, [r1,#12] ; r8 = <x[0,7]|x[0,6]>
+ SMULWB r2, r10,r0 ; r2 = OC_C6S2*x[0,2]>>16
+ SSUB16 r5, r5, r3 ; r5 = <t[1,5]|t[0,5]>
+ SMULWB r9, r10,r12 ; r9 = OC_C6S2*x[1,2]>>16
+ LDR r7, [r1,#28] ; r7 = <x[1,7]|x[1,6]>
+ SMULWB r3, r10,r8 ; r3 = OC_C6S2*x[0,6]>>16
+ SMULWB r10,r10,r7 ; r10= OC_C6S2*x[1,6]>>16
+ PKHBT r2, r2, r9, LSL #16 ; r2 = <r9|r2>
+ SMLAWB r3, r11,r0, r3 ; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+ SMLAWB r10,r11,r12,r10 ; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+ SMULWB r9, r11,r8 ; r9 = OC_C2S6*x[0,6]>>16
+ PKHBT r3, r3, r10,LSL #16 ; r3 = <t[1,3]|t[0,3]>
+ SMULWB r12,r11,r7 ; r12= OC_C2S6*x[1,6]>>16
+ ;4-7 rotation by 7pi/16
+ LDRD r10,OC_C7S1_8_v6 ; r10= OC_C7S1, r11= OC_C1S7
+ PKHBT r9, r9, r12,LSL #16 ; r9 = <r12|r9>
+ LDR r0, [r1],#16 ; r0 = <x[0,1]|x[0,0]>
+ PKHTB r7, r7, r8, ASR #16 ; r7 = <x[1,7]|x[0,7]>
+ SSUB16 r2, r2, r9 ; r2 = <t[1,2]|t[0,2]>
+ SMULWB r9, r10,r7 ; r9 = OC_C7S1*x[0,7]>>16
+ LDR r14,[r1],#16 ; r14= <x[1,1]|x[1,0]>
+ SMULWT r12,r10,r7 ; r12= OC_C7S1*x[1,7]>>16
+ SMULWT r8, r10,r0 ; r8 = OC_C7S1*x[0,1]>>16
+ SMULWT r10,r10,r14 ; r10= OC_C7S1*x[1,1]>>16
+ SMLAWT r9, r11,r0, r9 ; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+ PKHBT r8, r8, r10,LSL #16 ; r8 = <r10|r8>
+ SMLAWT r12,r11,r14,r12 ; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+ PKHBT r0, r0, r14,LSL #16 ; r0 = <x[1,0]|x[0,0]>
+ SMULWB r10,r11,r7 ; r10= OC_C1S7*x[0,7]>>16
+ PKHBT r9, r9, r12,LSL #16 ; r9 = <t[1,7]|t[0,7]>
+ SMULWT r12,r11,r7 ; r12= OC_C1S7*x[1,7]>>16
+ ;0-1 butterfly
+ LDR r11,OC_C4S4
+ PKHBT r10,r10,r12,LSL #16 ; r10= <r12|r10>
+ SADD16 r7, r0, r4 ; r7 = x[0]+x[4]
+ SSUB16 r10,r8, r10 ; r10= <t[1,4]|t[0,4]>
+ SSUB16 r4, r0, r4 ; r4 = x[0]-x[4]
+ SMULWB r8, r11,r7 ; r8 = t[0,0]=OC_C4S4*r7B>>16
+ SMULWT r12,r11,r7 ; r12= t[1,0]=OC_C4S4*r7T>>16
+ SMULWB r7, r11,r4 ; r7 = t[0,1]=OC_C4S4*r4B>>16
+ PKHBT r12,r8, r12,LSL #16 ; r12= <t[1,0]|t[0,0]>
+ SMULWT r8, r11,r4 ; r8 = t[1,1]=OC_C4S4*r4T>>16
+; Stage 2:
+ SADD16 r4, r10,r5 ; r4 = t[4]'=t[4]+t[5]
+ PKHBT r8, r7, r8, LSL #16 ; r8 = <t[1,1]|t[0,1]>
+ SSUB16 r5, r10,r5 ; r5 = t[4]-t[5]
+ SMULWB r10,r11,r5 ; r10= t[0,5]=OC_C4S4*r5B>>16
+ SADD16 r7, r9, r6 ; r7 = t[7]'=t[7]+t[6]
+ SMULWT r5, r11,r5 ; r5 = t[1,5]=OC_C4S4*r5T>>16
+ SSUB16 r6, r9, r6 ; r6 = t[7]-t[6]
+ SMULWB r9, r11,r6 ; r9 = t[0,6]=OC_C4S4*r6B>>16
+ PKHBT r10,r10,r5, LSL #16 ; r10= <t[1,5]|t[0,5]>
+ SMULWT r6, r11,r6 ; r6 = t[1,6]=OC_C4S4*r6T>>16
+; Stage 3:
+ SADD16 r11,r8, r2 ; r11= t[1]'=t[1]+t[2]
+ PKHBT r6, r9, r6, LSL #16 ; r6 = <t[1,6]|t[0,6]>
+ SSUB16 r2, r8, r2 ; r2 = t[2]=t[1]-t[2]
+ LDMFD r13!,{r0,r14}
+ B idct4_3core_stage3_5_v6
+ ENDP
+
+; Another copy so the LDRD offsets are less than +/- 255.
+ ALIGN 8 +OC_C7S1_8_v6 + DCD 12785 ; 31F1 +OC_C1S7_8_v6 + DCD 64277 ; FB15 +OC_C6S2_8_v6 + DCD 25080 ; 61F8 +OC_C2S6_8_v6 + DCD 60547 ; EC83 +OC_C5S3_8_v6 + DCD 36410 ; 8E3A +OC_C3S5_8_v6 + DCD 54491 ; D4DB + +idct8_8core_down_v6 PROC + STMFD r13!,{r0,r14} +; Stage 1: + ;5-6 rotation by 3pi/16 + LDRD r10,OC_C5S3_8_v6 ; r10= OC_C5S3, r11= OC_C3S5 + LDR r4, [r1,#8] ; r4 = <x[0,5]|x[0,4]> + LDR r7, [r1,#24] ; r7 = <x[1,5]|x[1,4]> + SMULWT r5, r11,r4 ; r5 = OC_C3S5*x[0,5]>>16 + LDR r0, [r1,#4] ; r0 = <x[0,3]|x[0,2]> + SMULWT r3, r11,r7 ; r3 = OC_C3S5*x[1,5]>>16 + LDR r12,[r1,#20] ; r12= <x[1,3]|x[1,2]> + SMULWT r6, r11,r0 ; r6 = OC_C3S5*x[0,3]>>16 + SMULWT r11,r11,r12 ; r11= OC_C3S5*x[1,3]>>16 + SMLAWT r6, r10,r4, r6 ; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16) + PKHBT r5, r5, r3, LSL #16 ; r5 = <r3|r5> + SMLAWT r11,r10,r7, r11 ; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16) + PKHBT r4, r4, r7, LSL #16 ; r4 = <x[1,4]|x[0,4]> + SMULWT r3, r10,r0 ; r3 = OC_C5S3*x[0,3]>>16 + PKHBT r6, r6, r11,LSL #16 ; r6 = <t[1,6]|t[0,6]> + SMULWT r8, r10,r12 ; r8 = OC_C5S3*x[1,3]>>16 + ;2-3 rotation by 6pi/16 + LDRD r10,OC_C6S2_8_v6 ; r10= OC_C6S2, r11= OC_C2S6 + PKHBT r3, r3, r8, LSL #16 ; r3 = <r8|r3> + LDR r8, [r1,#12] ; r8 = <x[0,7]|x[0,6]> + SMULWB r2, r10,r0 ; r2 = OC_C6S2*x[0,2]>>16 + SSUB16 r5, r5, r3 ; r5 = <t[1,5]|t[0,5]> + SMULWB r9, r10,r12 ; r9 = OC_C6S2*x[1,2]>>16 + LDR r7, [r1,#28] ; r7 = <x[1,7]|x[1,6]> + SMULWB r3, r10,r8 ; r3 = OC_C6S2*x[0,6]>>16 + SMULWB r10,r10,r7 ; r10= OC_C6S2*x[1,6]>>16 + PKHBT r2, r2, r9, LSL #16 ; r2 = <r2|r9> + SMLAWB r3, r11,r0, r3 ; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16) + SMLAWB r10,r11,r12,r10 ; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16) + SMULWB r9, r11,r8 ; r9 = OC_C2S6*x[0,6]>>16 + PKHBT r3, r3, r10,LSL #16 ; r3 = <t[1,6]|t[0,6]> + SMULWB r12,r11,r7 ; r12= OC_C2S6*x[1,6]>>16 + ;4-7 rotation by 7pi/16 + LDRD r10,OC_C7S1_8_v6 ; r10= OC_C7S1, r11= OC_C1S7 + PKHBT r9, r9, r12,LSL #16 ; r9 = <r9|r12> + LDR r0, [r1],#16 ; r0 = <x[0,1]|x[0,0]> + PKHTB r7, r7, r8, ASR 
#16 ; r7 = <x[1,7]|x[0,7]> + SSUB16 r2, r2, r9 ; r2 = <t[1,2]|t[0,2]> + SMULWB r9, r10,r7 ; r9 = OC_C7S1*x[0,7]>>16 + LDR r14,[r1],#16 ; r14= <x[1,1]|x[1,0]> + SMULWT r12,r10,r7 ; r12= OC_C7S1*x[1,7]>>16 + SMULWT r8, r10,r0 ; r8 = OC_C7S1*x[0,1]>>16 + SMULWT r10,r10,r14 ; r10= OC_C7S1*x[1,1]>>16 + SMLAWT r9, r11,r0, r9 ; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16) + PKHBT r8, r8, r10,LSL #16 ; r8 = <r12|r8> + SMLAWT r12,r11,r14,r12 ; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16) + PKHBT r0, r0, r14,LSL #16 ; r0 = <x[1,0]|x[0,0]> + SMULWB r10,r11,r7 ; r10= OC_C1S7*x[0,6]>>16 + PKHBT r9, r9, r12,LSL #16 ; r9 = <t[1,7]|t[0,7]> + SMULWT r12,r11,r7 ; r12= OC_C1S7*x[1,6]>>16 + ;0-1 butterfly + LDR r11,OC_C4S4 + MOV r14,#8 + PKHBT r10,r10,r12,LSL #16 ; r10= <r12|r10> + SADD16 r7, r0, r4 ; r7 = x[0]+x[4] + SSUB16 r10,r8, r10 ; r10= <t[1,4]|t[0,4]> + SMLAWB r8, r11,r7, r14 ; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8 + SSUB16 r4, r0, r4 ; r4 = x[0]-x[4] + SMLAWT r12,r11,r7, r14 ; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8 + SMLAWB r7, r11,r4, r14 ; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8 + PKHBT r12,r8, r12,LSL #16 ; r12= <t[1,0]+8|t[0,0]+8> + SMLAWT r8, r11,r4, r14 ; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8 +; Stage 2: + SADD16 r4, r10,r5 ; r4 = t[4]'=t[4]+t[5] + PKHBT r8, r7, r8, LSL #16 ; r8 = <t[1,0]+8|t[0,0]+8> + SSUB16 r5, r10,r5 ; r5 = t[4]-t[5] + SMULWB r10,r11,r5 ; r10= t[0,5]=OC_C4S4*r5B>>16 + SADD16 r7, r9, r6 ; r7 = t[7]'=t[7]+t[6] + SMULWT r5, r11,r5 ; r5 = t[1,5]=OC_C4S4*r5T>>16 + SSUB16 r6, r9, r6 ; r6 = t[7]-t[6] + SMULWB r9, r11,r6 ; r9 = t[0,6]=OC_C4S4*r6B>>16 + PKHBT r10,r10,r5, LSL #16 ; r10= <t[1,5]|t[0,5]> + SMULWT r6, r11,r6 ; r6 = t[1,6]=OC_C4S4*r6T>>16 +; Stage 3: + SADD16 r11,r8, r2 ; r11= t[1]'+8=t[1]+t[2]+8 + PKHBT r6, r9, r6, LSL #16 ; r6 = <t[1,6]|t[0,6]> + SSUB16 r2, r8, r2 ; r2 = t[2]+8=t[1]-t[2]+8 + LDMFD r13!,{r0,r14} +idct8_8core_down_stage3_5_v6 + SSUB16 r5, r6, r10 ; r5 = t[5]'=t[6]-t[5] + SADD16 r6, r6, r10 ; r6 = t[6]=t[6]+t[5] + SADD16 r10,r12,r3 ; r10= t[0]'+8=t[0]+t[3]+8 + SSUB16 
r3, r12,r3 ; r3 = t[3]+8=t[0]-t[3]+8 +; Stage 4: + SADD16 r12,r10,r7 ; r12= t[0]+t[7]+8 + SSUB16 r7, r10,r7 ; r7 = t[0]-t[7]+8 + MOV r10,r12,ASR #4 + MOV r12,r12,LSL #16 + PKHTB r10,r10,r12,ASR #20 ; r10= t[0]+t[7]+8>>4 + STR r10,[r0], #4 ; y[0<<3] = t[0]+t[7]+8>>4 + SADD16 r12,r11,r6 ; r12= t[1]+t[6]+8 + SSUB16 r6, r11,r6 ; r6 = t[1]-t[6]+8 + MOV r10,r12,ASR #4 + MOV r12,r12,LSL #16 + PKHTB r10,r10,r12,ASR #20 ; r10= t[1]+t[6]+8>>4 + STR r10,[r0, #12] ; y[1<<3] = t[1]+t[6]+8>>4 + SADD16 r12,r2, r5 ; r12= t[2]+t[5]+8 + SSUB16 r5, r2, r5 ; r5 = t[2]-t[5]+8 + MOV r10,r12,ASR #4 + MOV r12,r12,LSL #16 + PKHTB r10,r10,r12,ASR #20 ; r10= t[2]+t[5]+8>>4 + STR r10,[r0, #28] ; y[2<<3] = t[2]+t[5]+8>>4 + SADD16 r12,r3, r4 ; r12= t[3]+t[4]+8 + SSUB16 r4, r3, r4 ; r4 = t[3]-t[4]+8 + MOV r10,r12,ASR #4 + MOV r12,r12,LSL #16 + PKHTB r10,r10,r12,ASR #20 ; r10= t[3]+t[4]+8>>4 + STR r10,[r0, #44] ; y[3<<3] = t[3]+t[4]+8>>4 + MOV r10,r4, ASR #4 + MOV r4, r4, LSL #16 + PKHTB r10,r10,r4, ASR #20 ; r10= t[3]-t[4]+8>>4 + STR r10,[r0, #60] ; y[4<<3] = t[3]-t[4]+8>>4 + MOV r10,r5, ASR #4 + MOV r5, r5, LSL #16 + PKHTB r10,r10,r5, ASR #20 ; r10= t[2]-t[5]+8>>4 + STR r10,[r0, #76] ; y[5<<3] = t[2]-t[5]+8>>4 + MOV r10,r6, ASR #4 + MOV r6, r6, LSL #16 + PKHTB r10,r10,r6, ASR #20 ; r10= t[1]-t[6]+8>>4 + STR r10,[r0, #92] ; y[6<<3] = t[1]-t[6]+8>>4 + MOV r10,r7, ASR #4 + MOV r7, r7, LSL #16 + PKHTB r10,r10,r7, ASR #20 ; r10= t[0]-t[7]+8>>4 + STR r10,[r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4 + MOV PC,r14 + ENDP + ] + + [ OC_ARM_ASM_NEON + EXPORT oc_idct8x8_1_neon + EXPORT oc_idct8x8_neon + + ALIGN 16 +OC_IDCT_CONSTS_NEON + DCW 8 + DCW 64277 ; FB15 (C1S7) + DCW 60547 ; EC83 (C2S6) + DCW 54491 ; D4DB (C3S5) + DCW 46341 ; B505 (C4S4) + DCW 36410 ; 471D (C5S3) + DCW 25080 ; 30FC (C6S2) + DCW 12785 ; 31F1 (C7S1) + +oc_idct8x8_1_neon PROC + ; r0 = ogg_int16_t *_y + ; r1 = ogg_uint16_t _dc + VDUP.S16 Q0, r1 + VMOV Q1, Q0 + VST1.64 {D0, D1, D2, D3}, [r0@128]! + VST1.64 {D0, D1, D2, D3}, [r0@128]! 
+ VST1.64 {D0, D1, D2, D3}, [r0@128]! + VST1.64 {D0, D1, D2, D3}, [r0@128] + MOV PC, r14 + ENDP + +oc_idct8x8_neon PROC + ; r0 = ogg_int16_t *_y + ; r1 = ogg_int16_t *_x + ; r2 = int _last_zzi + CMP r2, #10 + BLE oc_idct8x8_10_neon +oc_idct8x8_slow_neon + VPUSH {D8-D15} + MOV r2, r1 + ADR r3, OC_IDCT_CONSTS_NEON + ; Row transforms (input is pre-transposed) + VLD1.64 {D16,D17,D18,D19}, [r2@128]! + VLD1.64 {D20,D21,D22,D23}, [r2@128]! + VLD1.64 {D24,D25,D26,D27}, [r2@128]! + VSUB.S16 Q1, Q8, Q12 ; Q8 = x[0]-x[4] + VLD1.64 {D28,D29,D30,D31}, [r2@128] + VADD.S16 Q8, Q8, Q12 ; Q1 = x[0]+x[4] + VLD1.64 {D0,D1}, [r3@128] + MOV r12, r14 + BL oc_idct8x8_stage123_neon +; Stage 4 + VSUB.S16 Q15,Q8, Q7 ; Q15 = y[7]=t[0]'-t[7]' + VADD.S16 Q8, Q8, Q7 ; Q8 = y[0]=t[0]'+t[7]' + VSUB.S16 Q14,Q9, Q3 ; Q14 = y[6]=t[1]'-t[6]'' + VADD.S16 Q9, Q9, Q3 ; Q9 = y[1]=t[1]'+t[6]'' + VSUB.S16 Q13,Q10,Q5 ; Q13 = y[5]=t[2]'-t[5]'' + VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]'' + VTRN.16 Q14,Q15 + VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]' + VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]' + ; 8x8 Transpose + VTRN.16 Q8, Q9 + VTRN.16 Q10,Q11 + VTRN.16 Q12,Q13 + VTRN.32 Q8, Q10 + VTRN.32 Q9, Q11 + VTRN.32 Q12,Q14 + VTRN.32 Q13,Q15 + VSWP D17,D24 + VSUB.S16 Q1, Q8, Q12 ; Q8 = x[0]-x[4] + VSWP D19,D26 + VADD.S16 Q8, Q8, Q12 ; Q1 = x[0]+x[4] + VSWP D21,D28 + VSWP D23,D30 + ; Column transforms + BL oc_idct8x8_stage123_neon + CMP r0,r1 + ; We have to put the return address back in the LR, or the branch + ; predictor will not recognize the function return and mis-predict the + ; entire call stack. 
+ MOV r14, r12 +; Stage 4 + VSUB.S16 Q15,Q8, Q7 ; Q15 = y[7]=t[0]'-t[7]' + VADD.S16 Q8, Q8, Q7 ; Q8 = y[0]=t[0]'+t[7]' + VSUB.S16 Q14,Q9, Q3 ; Q14 = y[6]=t[1]'-t[6]'' + VADD.S16 Q9, Q9, Q3 ; Q9 = y[1]=t[1]'+t[6]'' + VSUB.S16 Q13,Q10,Q5 ; Q13 = y[5]=t[2]'-t[5]'' + VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]'' + VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]' + VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]' + BEQ oc_idct8x8_slow_neon_noclear + VMOV.I8 Q2,#0 + VPOP {D8-D15} + VMOV.I8 Q3,#0 + VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 + VST1.64 {D4, D5, D6, D7}, [r1@128]! + VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 + VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 + VST1.64 {D4, D5, D6, D7}, [r1@128]! + VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 + VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 + VST1.64 {D4, D5, D6, D7}, [r1@128]! + VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 + VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 + VST1.64 {D4, D5, D6, D7}, [r1@128] + VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 + VSTMIA r0, {D16-D31} + MOV PC, r14 + +oc_idct8x8_slow_neon_noclear + VPOP {D8-D15} + VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 + VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 + VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 + VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 + VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 + VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 + VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 + VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 + VSTMIA r0, {D16-D31} + MOV PC, r14 + ENDP + +oc_idct8x8_stage123_neon PROC +; Stages 1 & 2 + VMULL.S16 Q4, D18,D1[3] + VMULL.S16 Q5, D19,D1[3] + VMULL.S16 Q7, D30,D1[3] + VMULL.S16 Q6, D31,D1[3] + VMULL.S16 Q2, D30,D0[1] + VMULL.S16 Q3, D31,D0[1] + VSHRN.S32 D8, Q4, #16 + VSHRN.S32 D9, Q5, #16 ; Q4 = (OC_C7S1*x[1]>>16) + VSHRN.S32 D14,Q7, #16 + VSHRN.S32 D15,Q6, #16 ; Q7 = (OC_C7S1*x[7]>>16) + VSHRN.S32 D4, Q2, #16 + VSHRN.S32 D5, Q3, #16 ; Q2 = (OC_C1S7*x[7]>>16)-x[7] + VSUB.S16 Q4, Q4, Q15 + VADD.S16 Q7, Q7, Q9 + VSUB.S16 Q4, Q4, Q2 ; Q4 = t[4] + VMULL.S16 Q2, D18,D0[1] + VMULL.S16 Q9, D19,D0[1] + VMULL.S16 Q5, 
D26,D0[3] + VMULL.S16 Q3, D27,D0[3] + VMULL.S16 Q6, D22,D0[3] + VMULL.S16 Q12,D23,D0[3] + VSHRN.S32 D4, Q2, #16 + VSHRN.S32 D5, Q9, #16 ; Q2 = (OC_C1S7*x[1]>>16)-x[1] + VSHRN.S32 D10,Q5, #16 + VSHRN.S32 D11,Q3, #16 ; Q5 = (OC_C3S5*x[5]>>16)-x[5] + VSHRN.S32 D12,Q6, #16 + VSHRN.S32 D13,Q12,#16 ; Q6 = (OC_C3S5*x[3]>>16)-x[3] + VADD.S16 Q7, Q7, Q2 ; Q7 = t[7] + VSUB.S16 Q5, Q5, Q11 + VADD.S16 Q6, Q6, Q11 + VADD.S16 Q5, Q5, Q13 + VADD.S16 Q6, Q6, Q13 + VMULL.S16 Q9, D22,D1[1] + VMULL.S16 Q11,D23,D1[1] + VMULL.S16 Q15,D26,D1[1] + VMULL.S16 Q13,D27,D1[1] + VMULL.S16 Q2, D20,D1[2] + VMULL.S16 Q12,D21,D1[2] + VSHRN.S32 D18,Q9, #16 + VSHRN.S32 D19,Q11,#16 ; Q9 = (OC_C5S3*x[3]>>16)-x[3] + VSHRN.S32 D30,Q15,#16 + VSHRN.S32 D31,Q13,#16 ; Q15= (OC_C5S3*x[5]>>16)-x[5] + VSHRN.S32 D4, Q2, #16 + VSHRN.S32 D5, Q12,#16 ; Q2 = (OC_C6S2*x[2]>>16) + VSUB.S16 Q5, Q5, Q9 ; Q5 = t[5] + VADD.S16 Q6, Q6, Q15 ; Q6 = t[6] + VSUB.S16 Q2, Q2, Q14 + VMULL.S16 Q3, D28,D1[2] + VMULL.S16 Q11,D29,D1[2] + VMULL.S16 Q12,D28,D0[2] + VMULL.S16 Q9, D29,D0[2] + VMULL.S16 Q13,D20,D0[2] + VMULL.S16 Q15,D21,D0[2] + VSHRN.S32 D6, Q3, #16 + VSHRN.S32 D7, Q11,#16 ; Q3 = (OC_C6S2*x[6]>>16) + VSHRN.S32 D24,Q12,#16 + VSHRN.S32 D25,Q9, #16 ; Q12= (OC_C2S6*x[6]>>16)-x[6] + VSHRN.S32 D26,Q13,#16 + VSHRN.S32 D27,Q15,#16 ; Q13= (OC_C2S6*x[2]>>16)-x[2] + VSUB.S16 Q9, Q4, Q5 ; Q9 = t[4]-t[5] + VSUB.S16 Q11,Q7, Q6 ; Q11= t[7]-t[6] + VADD.S16 Q3, Q3, Q10 + VADD.S16 Q4, Q4, Q5 ; Q4 = t[4]'=t[4]+t[5] + VADD.S16 Q7, Q7, Q6 ; Q7 = t[7]'=t[7]+t[6] + VSUB.S16 Q2, Q2, Q12 ; Q2 = t[2] + VADD.S16 Q3, Q3, Q13 ; Q3 = t[3] + VMULL.S16 Q12,D16,D1[0] + VMULL.S16 Q13,D17,D1[0] + VMULL.S16 Q14,D2, D1[0] + VMULL.S16 Q15,D3, D1[0] + VMULL.S16 Q5, D18,D1[0] + VMULL.S16 Q6, D22,D1[0] + VSHRN.S32 D24,Q12,#16 + VSHRN.S32 D25,Q13,#16 + VSHRN.S32 D28,Q14,#16 + VSHRN.S32 D29,Q15,#16 + VMULL.S16 Q13,D19,D1[0] + VMULL.S16 Q15,D23,D1[0] + VADD.S16 Q8, Q8, Q12 ; Q8 = t[0] + VADD.S16 Q1, Q1, Q14 ; Q1 = t[1] + VSHRN.S32 D10,Q5, #16 + VSHRN.S32 D12,Q6, 
#16 + VSHRN.S32 D11,Q13,#16 + VSHRN.S32 D13,Q15,#16 + VADD.S16 Q5, Q5, Q9 ; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16 + VADD.S16 Q6, Q6, Q11 ; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16 +; Stage 3 + VSUB.S16 Q11,Q8, Q3 ; Q11 = t[3]''=t[0]-t[3] + VADD.S16 Q8, Q8, Q3 ; Q8 = t[0]''=t[0]+t[3] + VADD.S16 Q9, Q1, Q2 ; Q9 = t[1]''=t[1]+t[2] + VADD.S16 Q3, Q6, Q5 ; Q3 = t[6]''=t[6]'+t[5]' + VSUB.S16 Q10,Q1, Q2 ; Q10 = t[2]''=t[1]-t[2] + VSUB.S16 Q5, Q6, Q5 ; Q5 = t[5]''=t[6]'-t[5]' + MOV PC, r14 + ENDP + +oc_idct8x8_10_neon PROC + ADR r3, OC_IDCT_CONSTS_NEON + VLD1.64 {D0,D1}, [r3@128] + MOV r2, r1 + ; Row transforms (input is pre-transposed) +; Stage 1 + VLD1.64 {D16,D17,D18,D19},[r2@128]! + MOV r12, #16 + VMULL.S16 Q15,D16,D1[0] ; Q15= OC_C4S4*x[0]-(x[0]<<16) + VLD1.64 {D17}, [r2@64], r12 + VMULL.S16 Q2, D18,D0[1] ; Q2 = OC_C1S7*x[1]-(x[1]<<16) + VLD1.64 {D19}, [r2@64] + VMULL.S16 Q14,D17,D0[2] ; Q14= OC_C2S6*x[2]-(x[2]<<16) + VMULL.S16 Q3, D19,D0[3] ; Q3 = OC_C3S5*x[3]-(x[3]<<16) + VMULL.S16 Q13,D19,D1[1] ; Q13= OC_C5S3*x[3]-(x[3]<<16) + VMULL.S16 Q12,D18,D1[3] ; Q12= OC_C7S1*x[1] + VMULL.S16 Q1, D17,D1[2] ; Q1 = OC_C6S2*x[2] + VSHRN.S32 D30,Q15,#16 ; D30= t[0]-x[0] + VSHRN.S32 D4, Q2, #16 ; D4 = t[7]-x[1] + VSHRN.S32 D31,Q14,#16 ; D31= t[3]-x[2] + VSHRN.S32 D6, Q3, #16 ; D6 = t[6]-x[3] + VSHRN.S32 D7, Q13,#16 ; D7 = -t[5]-x[3] + VSHRN.S32 D5, Q12,#16 ; D5 = t[4] + VSHRN.S32 D2, Q1, #16 ; D2 = t[2] + VADD.S16 D4, D4, D18 ; D4 = t[7] + VADD.S16 D6, D6, D19 ; D6 = t[6] + VADD.S16 D7, D7, D19 ; D7 = -t[5] + VADD.S16 Q15,Q15,Q8 ; D30= t[0] + ; D31= t[3] +; Stages 2 & 3 + VSUB.S16 Q12,Q2, Q3 ; D24= t[7]-t[6] + ; D25= t[4]'=t[4]+t[5] + VADD.S16 Q13,Q2, Q3 ; D26= t[7]'=t[7]+t[6] + ; D27= t[4]-t[5] + VMULL.S16 Q11,D24,D1[0] ; Q11= OC_C4S4*(t[7]-t[6]) + ; -(t[7]-t[6]<<16) + VMULL.S16 Q14,D27,D1[0] ; Q14= OC_C4S4*(t[4]-t[5]) + ; -(t[4]-t[5]<<16) + VADD.S16 D16,D30,D31 ; D16= t[0]'=t[0]+t[3] + VSUB.S16 D17,D30,D2 ; D17= t[2]'=t[0]-t[2] + VADD.S16 D18,D30,D2 ; D18= t[1]'=t[0]+t[2] + VSHRN.S32 
D22,Q11,#16 ; D22= (OC_C4S4*(t[7]-t[6])>>16) + ; -(t[7]-t[6]) + VSHRN.S32 D23,Q14,#16 ; D23= (OC_C4S4*(t[4]-t[5])>>16) + ; -(t[4]-t[5]) + VSUB.S16 D19,D30,D31 ; D19= t[3]'=t[0]-t[3] + VADD.S16 D22,D22,D24 ; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16 + VADD.S16 D23,D23,D27 ; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16 + VSUB.S16 D27,D22,D23 ; D27= t[5]''=t[6]'-t[5]' + VADD.S16 D24,D22,D23 ; D24= t[6]''=t[6]'+t[5]' +; Stage 4 + VSUB.S16 Q11,Q8, Q13 ; D22= y[7]=t[0]'-t[7]' + ; D23= y[5]=t[2]'-t[5]'' + VSUB.S16 Q10,Q9, Q12 ; D20= y[6]=t[1]'-t[6]'' + ; D21= y[4]=t[3]'-t[4]' + VADD.S16 Q8, Q8, Q13 ; D16= y[0]=t[0]'+t[7]' + ; D17= y[2]=t[2]'+t[5]'' + VADD.S16 Q9, Q9, Q12 ; D18= y[1]=t[1]'+t[6]'' + ; D19= y[3]=t[3]'+t[4]' + ; 8x4 transpose + VTRN.16 Q10,Q11 ; Q10= c5c4a5a4 c7c6a7a6 + ; Q11= d5d4b5b4 d7d6b7b6 + VTRN.16 Q8, Q9 ; Q8 = c3c2a3a2 c1c0a1a0 + ; Q9 = d3d2b3b2 d1d0b1b0 + VSWP D20,D21 ; Q10= c7c6a7a6 c5c4a5a4 + VSWP D22,D23 ; Q11= d7d6b7b6 d5d4b5b4 + VUZP.32 Q9, Q11 ; Q9 = b7b6b5b4 b3b2b1b0 + ; Q11= d7d6d5d4 d3d2d1d0 + VMULL.S16 Q15,D18,D0[1] + VMULL.S16 Q13,D22,D1[1] + VUZP.32 Q8, Q10 ; Q8 = a7a6a5a4 a3a2a1a0 + ; Q10= c7c6c5c4 c3c2c1c0 + ; Column transforms +; Stages 1, 2, & 3 + VMULL.S16 Q14,D19,D0[1] ; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16) + VMULL.S16 Q12,D23,D1[1] ; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16) + VMULL.S16 Q3, D22,D0[3] + VMULL.S16 Q2, D23,D0[3] ; Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16) + VSHRN.S32 D30,Q15,#16 + VSHRN.S32 D31,Q14,#16 ; Q15= (OC_C1S7*x[1]>>16)-x[1] + VSHRN.S32 D26,Q13,#16 + VSHRN.S32 D27,Q12,#16 ; Q13= (OC_C5S3*x[3]>>16)-x[3] + VSHRN.S32 D28,Q3, #16 + VSHRN.S32 D29,Q2, #16 ; Q14= (OC_C3S5*x[3]>>16)-x[3] + VADD.S16 Q15,Q15,Q9 ; Q15= t[7] + VADD.S16 Q13,Q13,Q11 ; Q13= -t[5] + VADD.S16 Q14,Q14,Q11 ; Q14= t[6] + VMULL.S16 Q12,D18,D1[3] + VMULL.S16 Q2, D19,D1[3] ; Q2:Q12= OC_C7S1*x[1] + VMULL.S16 Q1, D16,D1[0] + VMULL.S16 Q11,D17,D1[0] ; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16) + VMULL.S16 Q3, D20,D0[2] + VMULL.S16 Q9, D21,D0[2] ; Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16) + VSHRN.S32 D24,Q12,#16 
+ VSHRN.S32 D25,Q2, #16 ; Q12= t[4] + VMULL.S16 Q2, D20,D1[2] + VSHRN.S32 D2, Q1, #16 + VSHRN.S32 D3, Q11,#16 ; Q1 = (OC_C4S4*x[0]>>16)-x[0] + VMULL.S16 Q11,D21,D1[2] ; Q2:Q11= OC_C6S2*x[2] + VSHRN.S32 D6, Q3, #16 + VSHRN.S32 D7, Q9, #16 ; Q3 = (OC_C2S6*x[2]>>16)-x[2] + VSUB.S16 Q9, Q15,Q14 ; Q9 = t[7]-t[6] + VADD.S16 Q15,Q15,Q14 ; Q15= t[7]'=t[7]+t[6] + VSHRN.S32 D4, Q2, #16 + VSHRN.S32 D5, Q11,#16 ; Q2 = t[2] + VADD.S16 Q1, Q1, Q8 ; Q1 = t[0] + VADD.S16 Q8, Q12,Q13 ; Q8 = t[4]-t[5] + VADD.S16 Q3, Q3, Q10 ; Q3 = t[3] + VMULL.S16 Q10,D16,D1[0] + VMULL.S16 Q11,D17,D1[0] ; Q11:Q10= OC_C4S4*(t[4]-t[5]) + ; -(t[4]-t[5]<<16) + VSUB.S16 Q12,Q12,Q13 ; Q12= t[4]'=t[4]+t[5] + VMULL.S16 Q14,D18,D1[0] + VMULL.S16 Q13,D19,D1[0] ; Q13:Q14= OC_C4S4*(t[7]-t[6]) + ; -(t[7]-t[6]<<16) + VSHRN.S32 D20,Q10,#16 + VSHRN.S32 D21,Q11,#16 ; Q10= (OC_C4S4*(t[4]-t[5])>>16) + ; -(t[4]-t[5]) + VADD.S16 Q11,Q1, Q3 ; Q11= t[0]'=t[0]+t[3] + VSUB.S16 Q3, Q1, Q3 ; Q3 = t[3]'=t[0]-t[3] + VSHRN.S32 D28,Q14,#16 + VSHRN.S32 D29,Q13,#16 ; Q14= (OC_C4S4*(t[7]-t[6])>>16) + ; -(t[7]-t[6]) + VADD.S16 Q10,Q10,Q8 ; Q10=t[5]' + VADD.S16 Q14,Q14,Q9 ; Q14=t[6]' + VSUB.S16 Q13,Q14,Q10 ; Q13=t[5]''=t[6]'-t[5]' + VADD.S16 Q14,Q14,Q10 ; Q14=t[6]''=t[6]'+t[5]' + VADD.S16 Q10,Q1, Q2 ; Q10= t[1]'=t[0]+t[2] + VSUB.S16 Q2, Q1, Q2 ; Q2 = t[2]'=t[0]-t[2] +; Stage 4 + CMP r0, r1 + VADD.S16 Q8, Q11,Q15 ; Q8 = y[0]=t[0]'+t[7]' + VADD.S16 Q9, Q10,Q14 ; Q9 = y[1]=t[1]'+t[6]'' + VSUB.S16 Q15,Q11,Q15 ; Q15 = y[7]=t[0]'-t[7]' + VSUB.S16 Q14,Q10,Q14 ; Q14 = y[6]=t[1]'-t[6]'' + VADD.S16 Q10,Q2, Q13 ; Q10 = y[2]=t[2]'+t[5]'' + VADD.S16 Q11,Q3, Q12 ; Q11 = y[3]=t[3]'+t[4]' + VSUB.S16 Q12,Q3, Q12 ; Q12 = y[4]=t[3]'-t[4]' + VSUB.S16 Q13,Q2, Q13 ; Q13 = y[5]=t[2]'-t[5]'' + BEQ oc_idct8x8_10_neon_noclear + VMOV.I8 D2, #0 + VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 + VST1.64 {D2}, [r1@64], r12 + VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 + VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 + VST1.64 {D2}, [r1@64], r12 + VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 + 
VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 + VST1.64 {D2}, [r1@64], r12 + VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 + VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 + VST1.64 {D2}, [r1@64] + VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 + VSTMIA r0, {D16-D31} + MOV PC, r14 + +oc_idct8x8_10_neon_noclear + VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 + VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 + VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 + VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 + VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 + VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 + VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 + VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 + VSTMIA r0, {D16-D31} + MOV PC, r14 + ENDP + ] + + END diff --git a/media/libtheora/lib/arm/armint.h b/media/libtheora/lib/arm/armint.h new file mode 100644 index 000000000..cc62d2438 --- /dev/null +++ b/media/libtheora/lib/arm/armint.h @@ -0,0 +1,126 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $ + + ********************************************************************/ +#if !defined(_arm_armint_H) +# define _arm_armint_H (1) +# include "../internal.h" + +# if defined(OC_ARM_ASM) + +# if defined(__ARMEB__) +# error "Big-endian configurations are not supported by the ARM asm. " \ + "Reconfigure with --disable-asm or undefine OC_ARM_ASM." 
+# endif + +# define oc_state_accel_init oc_state_accel_init_arm +/*This function is implemented entirely in asm, so it's helpful to pull out all + of the things that depend on structure offsets. + We reuse the function pointer with the wrong prototype, though.*/ +# define oc_state_loop_filter_frag_rows(_state,_bv,_refi,_pli, \ + _fragy0,_fragy_end) \ + ((oc_loop_filter_frag_rows_arm_func) \ + (_state)->opt_vtable.state_loop_filter_frag_rows)( \ + (_state)->ref_frame_data[(_refi)],(_state)->ref_ystride[(_pli)], \ + (_bv), \ + (_state)->frags, \ + (_state)->fplanes[(_pli)].froffset \ + +(_fragy0)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \ + (_state)->fplanes[(_pli)].froffset \ + +(_fragy_end)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \ + (_state)->fplanes[(_pli)].froffset, \ + (_state)->fplanes[(_pli)].froffset+(_state)->fplanes[(_pli)].nfrags, \ + (_state)->frag_buf_offs, \ + (_state)->fplanes[(_pli)].nhfrags) +/*For everything else the default vtable macros are fine.*/ +# define OC_STATE_USE_VTABLE (1) +# endif + +# include "../state.h" +# include "armcpu.h" + +# if defined(OC_ARM_ASM) +typedef void (*oc_loop_filter_frag_rows_arm_func)( + unsigned char *_ref_frame_data,int _ystride,signed char _bv[256], + const oc_fragment *_frags,ptrdiff_t _fragi0,ptrdiff_t _fragi0_end, + ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot, + const ptrdiff_t *_frag_buf_offs,int _nhfrags); + +void oc_state_accel_init_arm(oc_theora_state *_state); +void oc_frag_copy_list_arm(unsigned char *_dst_frame, + const unsigned char *_src_frame,int _ystride, + const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs); +void oc_frag_recon_intra_arm(unsigned char *_dst,int _ystride, + const ogg_int16_t *_residue); +void oc_frag_recon_inter_arm(unsigned char *_dst,const unsigned char *_src, + int _ystride,const ogg_int16_t *_residue); +void oc_frag_recon_inter2_arm(unsigned char *_dst,const unsigned char *_src1, + const unsigned char *_src2,int _ystride,const ogg_int16_t 
*_residue); +void oc_idct8x8_1_arm(ogg_int16_t _y[64],ogg_uint16_t _dc); +void oc_idct8x8_arm(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); +void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi, + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); +void oc_loop_filter_frag_rows_arm(unsigned char *_ref_frame_data, + int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0, + ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot, + const ptrdiff_t *_frag_buf_offs,int _nhfrags); + +# if defined(OC_ARM_ASM_EDSP) +void oc_frag_copy_list_edsp(unsigned char *_dst_frame, + const unsigned char *_src_frame,int _ystride, + const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs); + +# if defined(OC_ARM_ASM_MEDIA) +void oc_frag_recon_intra_v6(unsigned char *_dst,int _ystride, + const ogg_int16_t *_residue); +void oc_frag_recon_inter_v6(unsigned char *_dst,const unsigned char *_src, + int _ystride,const ogg_int16_t *_residue); +void oc_frag_recon_inter2_v6(unsigned char *_dst,const unsigned char *_src1, + const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue); +void oc_idct8x8_1_v6(ogg_int16_t _y[64],ogg_uint16_t _dc); +void oc_idct8x8_v6(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); +void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi, + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); +void oc_loop_filter_init_v6(signed char *_bv,int _flimit); +void oc_loop_filter_frag_rows_v6(unsigned char *_ref_frame_data, + int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0, + ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot, + const ptrdiff_t *_frag_buf_offs,int _nhfrags); + +# if defined(OC_ARM_ASM_NEON) +void oc_frag_copy_list_neon(unsigned char *_dst_frame, + const unsigned char *_src_frame,int _ystride, + const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t 
*_frag_buf_offs); +void oc_frag_recon_intra_neon(unsigned char *_dst,int _ystride, + const ogg_int16_t *_residue); +void oc_frag_recon_inter_neon(unsigned char *_dst,const unsigned char *_src, + int _ystride,const ogg_int16_t *_residue); +void oc_frag_recon_inter2_neon(unsigned char *_dst,const unsigned char *_src1, + const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue); +void oc_idct8x8_1_neon(ogg_int16_t _y[64],ogg_uint16_t _dc); +void oc_idct8x8_neon(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); +void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi, + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); +void oc_loop_filter_init_neon(signed char *_bv,int _flimit); +void oc_loop_filter_frag_rows_neon(unsigned char *_ref_frame_data, + int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0, + ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot, + const ptrdiff_t *_frag_buf_offs,int _nhfrags); +# endif +# endif +# endif +# endif + +#endif diff --git a/media/libtheora/lib/arm/armloop.s b/media/libtheora/lib/arm/armloop.s new file mode 100644 index 000000000..0a1d4705e --- /dev/null +++ b/media/libtheora/lib/arm/armloop.s @@ -0,0 +1,682 @@ +;******************************************************************** +;* * +;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * +;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * +;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * +;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* +;* * +;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * +;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ * +;* * +;******************************************************************** +; Original implementation: +; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd +; last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $ +;******************************************************************** + + AREA |.text|, CODE, READONLY + + ; Explicitly specifying alignment here because some versions of + ; gas don't align code correctly. See + ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html + ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992 + ALIGN + + GET armopts.s + + EXPORT oc_loop_filter_frag_rows_arm + +; Which bit this is depends on the order of packing within a bitfield. +; Hopefully that doesn't change among any of the relevant compilers. +OC_FRAG_CODED_FLAG * 1 + + ; Vanilla ARM v4 version +loop_filter_h_arm PROC + ; r0 = unsigned char *_pix + ; r1 = int _ystride + ; r2 = int *_bv + ; preserves r0-r3 + STMFD r13!,{r3-r6,r14} + MOV r14,#8 + MOV r6, #255 +lfh_arm_lp + LDRB r3, [r0, #-2] ; r3 = _pix[0] + LDRB r12,[r0, #1] ; r12= _pix[3] + LDRB r4, [r0, #-1] ; r4 = _pix[1] + LDRB r5, [r0] ; r5 = _pix[2] + SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4 + ADD r3, r3, #4 + SUB r12,r5, r4 ; r12= _pix[2]-_pix[1] + ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1]) + ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4 + MOV r12,r12,ASR #3 + LDRSB r12,[r2, r12] + ; Stall (2 on Xscale) + ADDS r4, r4, r12 + CMPGT r6, r4 + EORLT r4, r6, r4, ASR #32 + SUBS r5, r5, r12 + CMPGT r6, r5 + EORLT r5, r6, r5, ASR #32 + STRB r4, [r0, #-1] + STRB r5, [r0], r1 + SUBS r14,r14,#1 + BGT lfh_arm_lp + SUB r0, r0, r1, LSL #3 + LDMFD r13!,{r3-r6,PC} + ENDP + +loop_filter_v_arm PROC + ; r0 = unsigned char *_pix + ; r1 = int _ystride + ; r2 = int *_bv + ; preserves r0-r3 + STMFD r13!,{r3-r6,r14} + MOV r14,#8 + MOV r6, #255 
+lfv_arm_lp + LDRB r3, [r0, -r1, LSL #1] ; r3 = _pix[0] + LDRB r12,[r0, r1] ; r12= _pix[3] + LDRB r4, [r0, -r1] ; r4 = _pix[1] + LDRB r5, [r0] ; r5 = _pix[2] + SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4 + ADD r3, r3, #4 + SUB r12,r5, r4 ; r12= _pix[2]-_pix[1] + ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1]) + ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4 + MOV r12,r12,ASR #3 + LDRSB r12,[r2, r12] + ; Stall (2 on Xscale) + ADDS r4, r4, r12 + CMPGT r6, r4 + EORLT r4, r6, r4, ASR #32 + SUBS r5, r5, r12 + CMPGT r6, r5 + EORLT r5, r6, r5, ASR #32 + STRB r4, [r0, -r1] + STRB r5, [r0], #1 + SUBS r14,r14,#1 + BGT lfv_arm_lp + SUB r0, r0, #8 + LDMFD r13!,{r3-r6,PC} + ENDP + +oc_loop_filter_frag_rows_arm PROC + ; r0 = _ref_frame_data + ; r1 = _ystride + ; r2 = _bv + ; r3 = _frags + ; r4 = _fragi0 + ; r5 = _fragi0_end + ; r6 = _fragi_top + ; r7 = _fragi_bot + ; r8 = _frag_buf_offs + ; r9 = _nhfrags + MOV r12,r13 + STMFD r13!,{r0,r4-r11,r14} + LDMFD r12,{r4-r9} + ADD r2, r2, #127 ; _bv += 127 + CMP r4, r5 ; if(_fragi0>=_fragi0_end) + BGE oslffri_arm_end ; bail + SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0) + BLE oslffri_arm_end ; bail + ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi] + ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi] + SUB r7, r7, r9 ; _fragi_bot -= _nhfrags; +oslffri_arm_lp1 + MOV r10,r4 ; r10= fragi = _fragi0 + ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1 +oslffri_arm_lp2 + LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++ + LDR r0, [r13] ; r0 = _ref_frame_data + LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++ + TST r14,#OC_FRAG_CODED_FLAG + BEQ oslffri_arm_uncoded + CMP r10,r4 ; if (fragi>_fragi0) + ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi] + BLGT loop_filter_h_arm + CMP r4, r6 ; if (_fragi0>_fragi_top) + BLGT loop_filter_v_arm + CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1) + LDRLT r12,[r3] ; r12 = _frags[fragi+1] + ADD r0, r0, #8 + ADD r10,r10,#1 ; r10 = fragi+1; + ANDLT 
r12,r12,#OC_FRAG_CODED_FLAG + CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0 + BLLT loop_filter_h_arm + CMP r10,r7 ; if (fragi<_fragi_bot) + LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1] + SUB r0, r0, #8 + ADD r0, r0, r1, LSL #3 + ANDLT r12,r12,#OC_FRAG_CODED_FLAG + CMPLT r12,#OC_FRAG_CODED_FLAG + BLLT loop_filter_v_arm + CMP r10,r11 ; while(fragi<=fragi_end-1) + BLE oslffri_arm_lp2 + MOV r4, r10 ; r4 = fragi0 += _nhfrags + CMP r4, r5 + BLT oslffri_arm_lp1 +oslffri_arm_end + LDMFD r13!,{r0,r4-r11,PC} +oslffri_arm_uncoded + ADD r10,r10,#1 + CMP r10,r11 + BLE oslffri_arm_lp2 + MOV r4, r10 ; r4 = _fragi0 += _nhfrags + CMP r4, r5 + BLT oslffri_arm_lp1 + LDMFD r13!,{r0,r4-r11,PC} + ENDP + + [ OC_ARM_ASM_MEDIA + EXPORT oc_loop_filter_init_v6 + EXPORT oc_loop_filter_frag_rows_v6 + +oc_loop_filter_init_v6 PROC + ; r0 = _bv + ; r1 = _flimit (=L from the spec) + MVN r1, r1, LSL #1 ; r1 = <0xFFFFFF|255-2*L> + AND r1, r1, #255 ; r1 = ll=r1&0xFF + ORR r1, r1, r1, LSL #8 ; r1 = <ll|ll> + PKHBT r1, r1, r1, LSL #16 ; r1 = <ll|ll|ll|ll> + STR r1, [r0] + MOV PC,r14 + ENDP + +; We could use the same strategy as the v filter below, but that would require +; 40 instructions to load the data and transpose it into columns and another +; 32 to write out the results at the end, plus the 52 instructions to do the +; filtering itself. +; This is slightly less, and less code, even assuming we could have shared the +; 52 instructions in the middle with the other function. +; It executes slightly fewer instructions than the ARMv6 approach David Conrad +; proposed for FFmpeg, but not by much: +; http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html +; His is a lot less code, though, because it only does two rows at once instead +; of four. 
+loop_filter_h_v6 PROC + ; r0 = unsigned char *_pix + ; r1 = int _ystride + ; r2 = int _ll + ; preserves r0-r3 + STMFD r13!,{r4-r11,r14} + LDR r12,=0x10003 + BL loop_filter_h_core_v6 + ADD r0, r0, r1, LSL #2 + BL loop_filter_h_core_v6 + SUB r0, r0, r1, LSL #2 + LDMFD r13!,{r4-r11,PC} + ENDP + +loop_filter_h_core_v6 PROC + ; r0 = unsigned char *_pix + ; r1 = int _ystride + ; r2 = int _ll + ; r12= 0x10003 + ; Preserves r0-r3, r12; Clobbers r4-r11. + LDR r4,[r0, #-2]! ; r4 = <p3|p2|p1|p0> + ; Single issue + LDR r5,[r0, r1]! ; r5 = <q3|q2|q1|q0> + UXTB16 r6, r4, ROR #16 ; r6 = <p0|p2> + UXTB16 r4, r4, ROR #8 ; r4 = <p3|p1> + UXTB16 r7, r5, ROR #16 ; r7 = <q0|q2> + UXTB16 r5, r5, ROR #8 ; r5 = <q3|q1> + PKHBT r8, r4, r5, LSL #16 ; r8 = <__|q1|__|p1> + PKHBT r9, r6, r7, LSL #16 ; r9 = <__|q2|__|p2> + SSUB16 r6, r4, r6 ; r6 = <p3-p0|p1-p2> + SMLAD r6, r6, r12,r12 ; r6 = <????|(p3-p0)+3*(p1-p2)+3> + SSUB16 r7, r5, r7 ; r7 = <q3-q0|q1-q2> + SMLAD r7, r7, r12,r12 ; r7 = <????|(q3-q0)+3*(q1-q2)+3> + LDR r4,[r0, r1]! ; r4 = <r3|r2|r1|r0> + MOV r6, r6, ASR #3 ; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3> + LDR r5,[r0, r1]! 
; r5 = <s3|s2|s1|s0> + PKHBT r11,r6, r7, LSL #13 ; r11= <??|-R_q|??|-R_p> + UXTB16 r6, r4, ROR #16 ; r6 = <r0|r2> + UXTB16 r11,r11 ; r11= <__|-R_q|__|-R_p> + UXTB16 r4, r4, ROR #8 ; r4 = <r3|r1> + UXTB16 r7, r5, ROR #16 ; r7 = <s0|s2> + PKHBT r10,r6, r7, LSL #16 ; r10= <__|s2|__|r2> + SSUB16 r6, r4, r6 ; r6 = <r3-r0|r1-r2> + UXTB16 r5, r5, ROR #8 ; r5 = <s3|s1> + SMLAD r6, r6, r12,r12 ; r6 = <????|(r3-r0)+3*(r1-r2)+3> + SSUB16 r7, r5, r7 ; r7 = <s3-s0|s1-s2> + SMLAD r7, r7, r12,r12 ; r7 = <????|(s3-s0)+3*(s1-s2)+3> + ORR r9, r9, r10, LSL #8 ; r9 = <s2|q2|r2|p2> + MOV r6, r6, ASR #3 ; r6 = <??????|(r3-r0)+3*(r1-r2)+3>>3> + PKHBT r10,r4, r5, LSL #16 ; r10= <__|s1|__|r1> + PKHBT r6, r6, r7, LSL #13 ; r6 = <??|-R_s|??|-R_r> + ORR r8, r8, r10, LSL #8 ; r8 = <s1|q1|r1|p1> + UXTB16 r6, r6 ; r6 = <__|-R_s|__|-R_r> + MOV r10,#0 + ORR r6, r11,r6, LSL #8 ; r6 = <-R_s|-R_q|-R_r|-R_p> + ; Single issue + ; There's no min, max or abs instruction. + ; SSUB8 and SEL will work for abs, and we can do all the rest with + ; unsigned saturated adds, which means the GE flags are still all + ; set when we're done computing lflim(abs(R_i),L). + ; This allows us to both add and subtract, and split the results by + ; the original sign of R_i. + SSUB8 r7, r10,r6 + ; Single issue + SEL r7, r7, r6 ; r7 = abs(R_i) + ; Single issue + UQADD8 r4, r7, r2 ; r4 = 255-max(2*L-abs(R_i),0) + ; Single issue + UQADD8 r7, r7, r4 + ; Single issue + UQSUB8 r7, r7, r4 ; r7 = min(abs(R_i),max(2*L-abs(R_i),0)) + ; Single issue + UQSUB8 r4, r8, r7 + UQADD8 r5, r9, r7 + UQADD8 r8, r8, r7 + UQSUB8 r9, r9, r7 + SEL r8, r8, r4 ; r8 = p1+lflim(R_i,L) + SEL r9, r9, r5 ; r9 = p2-lflim(R_i,L) + MOV r5, r9, LSR #24 ; r5 = s2 + STRB r5, [r0,#2]! + MOV r4, r8, LSR #24 ; r4 = s1 + STRB r4, [r0,#-1] + MOV r5, r9, LSR #8 ; r5 = r2 + STRB r5, [r0,-r1]! + MOV r4, r8, LSR #8 ; r4 = r1 + STRB r4, [r0,#-1] + MOV r5, r9, LSR #16 ; r5 = q2 + STRB r5, [r0,-r1]! 
+ MOV r4, r8, LSR #16 ; r4 = q1 + STRB r4, [r0,#-1] + ; Single issue + STRB r9, [r0,-r1]! + ; Single issue + STRB r8, [r0,#-1] + MOV PC,r14 + ENDP + +; This uses the same strategy as the MMXEXT version for x86, except that UHADD8 +; computes (a+b>>1) instead of (a+b+1>>1) like PAVGB. +; This works just as well, with the following procedure for computing the +; filter value, f: +; u = ~UHADD8(p1,~p2); +; v = UHADD8(~p1,p2); +; m = v-u; +; a = m^UHADD8(m^p0,m^~p3); +; f = UHADD8(UHADD8(a,u1),v1); +; where f = 127+R, with R in [-127,128] defined as in the spec. +; This is exactly the same amount of arithmetic as the version that uses PAVGB +; as the basic operator. +; It executes about 2/3 the number of instructions of David Conrad's approach, +; but requires more code, because it does all eight columns at once, instead +; of four at a time. +loop_filter_v_v6 PROC + ; r0 = unsigned char *_pix + ; r1 = int _ystride + ; r2 = int _ll + ; preserves r0-r11 + STMFD r13!,{r4-r11,r14} + LDRD r6, [r0, -r1]! ; r7, r6 = <p5|p1> + LDRD r4, [r0, -r1] ; r5, r4 = <p4|p0> + LDRD r8, [r0, r1]! ; r9, r8 = <p6|p2> + MVN r14,r6 ; r14= ~p1 + LDRD r10,[r0, r1] ; r11,r10= <p7|p3> + ; Filter the first four columns. + MVN r12,r8 ; r12= ~p2 + UHADD8 r14,r14,r8 ; r14= v1=~p1+p2>>1 + UHADD8 r12,r12,r6 ; r12= p1+~p2>>1 + MVN r10, r10 ; r10=~p3 + MVN r12,r12 ; r12= u1=~p1+p2+1>>1 + SSUB8 r14,r14,r12 ; r14= m1=v1-u1 + ; Single issue + EOR r4, r4, r14 ; r4 = m1^p0 + EOR r10,r10,r14 ; r10= m1^~p3 + UHADD8 r4, r4, r10 ; r4 = (m1^p0)+(m1^~p3)>>1 + ; Single issue + EOR r4, r4, r14 ; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1) + SADD8 r14,r14,r12 ; r14= v1=m1+u1 + UHADD8 r4, r4, r12 ; r4 = a1+u1>>1 + MVN r12,r9 ; r12= ~p6 + UHADD8 r4, r4, r14 ; r4 = f1=(a1+u1>>1)+v1>>1 + ; Filter the second four columns. 
+ MVN r14,r7 ; r14= ~p5 + UHADD8 r12,r12,r7 ; r12= p5+~p6>>1 + UHADD8 r14,r14,r9 ; r14= v2=~p5+p6>>1 + MVN r12,r12 ; r12= u2=~p5+p6+1>>1 + MVN r11,r11 ; r11=~p7 + SSUB8 r10,r14,r12 ; r10= m2=v2-u2 + ; Single issue + EOR r5, r5, r10 ; r5 = m2^p4 + EOR r11,r11,r10 ; r11= m2^~p7 + UHADD8 r5, r5, r11 ; r5 = (m2^p4)+(m2^~p7)>>1 + ; Single issue + EOR r5, r5, r10 ; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1) + ; Single issue + UHADD8 r5, r5, r12 ; r5 = a2+u2>>1 + LDR r12,=0x7F7F7F7F ; r12 = {127}x4 + UHADD8 r5, r5, r14 ; r5 = f2=(a2+u2>>1)+v2>>1 + ; Now split f[i] by sign. + ; There's no min or max instruction. + ; We could use SSUB8 and SEL, but this is just as many instructions and + ; dual issues more (for v7 without NEON). + UQSUB8 r10,r4, r12 ; r10= R_i>0?R_i:0 + UQSUB8 r4, r12,r4 ; r4 = R_i<0?-R_i:0 + UQADD8 r11,r10,r2 ; r11= 255-max(2*L-abs(R_i<0),0) + UQADD8 r14,r4, r2 ; r14= 255-max(2*L-abs(R_i>0),0) + UQADD8 r10,r10,r11 + UQADD8 r4, r4, r14 + UQSUB8 r10,r10,r11 ; r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0)) + UQSUB8 r4, r4, r14 ; r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0)) + UQSUB8 r11,r5, r12 ; r11= R_i>0?R_i:0 + UQADD8 r6, r6, r10 + UQSUB8 r8, r8, r10 + UQSUB8 r5, r12,r5 ; r5 = R_i<0?-R_i:0 + UQSUB8 r6, r6, r4 ; r6 = p1+lflim(R_i,L) + UQADD8 r8, r8, r4 ; r8 = p2-lflim(R_i,L) + UQADD8 r10,r11,r2 ; r10= 255-max(2*L-abs(R_i<0),0) + UQADD8 r14,r5, r2 ; r14= 255-max(2*L-abs(R_i>0),0) + UQADD8 r11,r11,r10 + UQADD8 r5, r5, r14 + UQSUB8 r11,r11,r10 ; r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0)) + UQSUB8 r5, r5, r14 ; r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0)) + UQADD8 r7, r7, r11 + UQSUB8 r9, r9, r11 + UQSUB8 r7, r7, r5 ; r7 = p5+lflim(R_i,L) + STRD r6, [r0, -r1] ; [p5:p1] = [r7: r6] + UQADD8 r9, r9, r5 ; r9 = p6-lflim(R_i,L) + STRD r8, [r0] ; [p6:p2] = [r9: r8] + LDMFD r13!,{r4-r11,PC} + ENDP + +oc_loop_filter_frag_rows_v6 PROC + ; r0 = _ref_frame_data + ; r1 = _ystride + ; r2 = _bv + ; r3 = _frags + ; r4 = _fragi0 + ; r5 = _fragi0_end + ; r6 = _fragi_top + ; r7 = _fragi_bot + ; r8 
= _frag_buf_offs + ; r9 = _nhfrags + MOV r12,r13 + STMFD r13!,{r0,r4-r11,r14} + LDMFD r12,{r4-r9} + LDR r2, [r2] ; ll = *(int *)_bv + CMP r4, r5 ; if(_fragi0>=_fragi0_end) + BGE oslffri_v6_end ; bail + SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0) + BLE oslffri_v6_end ; bail + ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi] + ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi] + SUB r7, r7, r9 ; _fragi_bot -= _nhfrags; +oslffri_v6_lp1 + MOV r10,r4 ; r10= fragi = _fragi0 + ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1 +oslffri_v6_lp2 + LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++ + LDR r0, [r13] ; r0 = _ref_frame_data + LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++ + TST r14,#OC_FRAG_CODED_FLAG + BEQ oslffri_v6_uncoded + CMP r10,r4 ; if (fragi>_fragi0) + ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi] + BLGT loop_filter_h_v6 + CMP r4, r6 ; if (fragi0>_fragi_top) + BLGT loop_filter_v_v6 + CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1) + LDRLT r12,[r3] ; r12 = _frags[fragi+1] + ADD r0, r0, #8 + ADD r10,r10,#1 ; r10 = fragi+1; + ANDLT r12,r12,#OC_FRAG_CODED_FLAG + CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0 + BLLT loop_filter_h_v6 + CMP r10,r7 ; if (fragi<_fragi_bot) + LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1] + SUB r0, r0, #8 + ADD r0, r0, r1, LSL #3 + ANDLT r12,r12,#OC_FRAG_CODED_FLAG + CMPLT r12,#OC_FRAG_CODED_FLAG + BLLT loop_filter_v_v6 + CMP r10,r11 ; while(fragi<=fragi_end-1) + BLE oslffri_v6_lp2 + MOV r4, r10 ; r4 = fragi0 += nhfrags + CMP r4, r5 + BLT oslffri_v6_lp1 +oslffri_v6_end + LDMFD r13!,{r0,r4-r11,PC} +oslffri_v6_uncoded + ADD r10,r10,#1 + CMP r10,r11 + BLE oslffri_v6_lp2 + MOV r4, r10 ; r4 = fragi0 += nhfrags + CMP r4, r5 + BLT oslffri_v6_lp1 + LDMFD r13!,{r0,r4-r11,PC} + ENDP + ] + + [ OC_ARM_ASM_NEON + EXPORT oc_loop_filter_init_neon + EXPORT oc_loop_filter_frag_rows_neon + +oc_loop_filter_init_neon PROC + ; r0 = _bv + ; r1 = _flimit (=L from the spec) + MOV r1, r1, 
LSL #1 ; r1 = 2*L + VDUP.S16 Q15, r1 ; Q15= 2L in U16s + VST1.64 {D30,D31}, [r0@128] + MOV PC,r14 + ENDP + +loop_filter_h_neon PROC + ; r0 = unsigned char *_pix + ; r1 = int _ystride + ; r2 = int *_bv + ; preserves r0-r3 + ; We assume Q15= 2*L in U16s + ; My best guesses at cycle counts (and latency)--vvv + SUB r12,r0, #2 + ; Doing a 2-element structure load saves doing two VTRN's below, at the + ; cost of using two more slower single-lane loads vs. the faster + ; all-lane loads. + ; It's less code this way, though, and benches a hair faster, but it + ; leaves D2 and D4 swapped. + VLD2.16 {D0[],D2[]}, [r12], r1 ; D0 = ____________1100 2,1 + ; D2 = ____________3322 + VLD2.16 {D4[],D6[]}, [r12], r1 ; D4 = ____________5544 2,1 + ; D6 = ____________7766 + VLD2.16 {D0[1],D2[1]},[r12], r1 ; D0 = ________99881100 3,1 + ; D2 = ________BBAA3322 + VLD2.16 {D4[1],D6[1]},[r12], r1 ; D4 = ________DDCC5544 3,1 + ; D6 = ________FFEE7766 + VLD2.16 {D0[2],D2[2]},[r12], r1 ; D0 = ____GGHH99881100 3,1 + ; D2 = ____JJIIBBAA3322 + VLD2.16 {D4[2],D6[2]},[r12], r1 ; D4 = ____KKLLDDCC5544 3,1 + ; D6 = ____NNMMFFEE7766 + VLD2.16 {D0[3],D2[3]},[r12], r1 ; D0 = PPOOGGHH99881100 3,1 + ; D2 = RRQQJJIIBBAA3322 + VLD2.16 {D4[3],D6[3]},[r12], r1 ; D4 = TTSSKKLLDDCC5544 3,1 + ; D6 = VVUUNNMMFFEE7766 + VTRN.8 D0, D4 ; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511 1,1 + VTRN.8 D2, D6 ; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733 1,1 + VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3 + VSUBL.U8 Q8, D2, D4 ; Q8 = 22 - 11 in S16s 1,3 + ADD r12,r0, #8 + VADD.S16 Q0, Q0, Q8 ; 1,3 + PLD [r12] + VADD.S16 Q0, Q0, Q8 ; 1,3 + PLD [r12,r1] + VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3 + PLD [r12,r1, LSL #1] + VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4 + ADD r12,r12,r1, LSL #2 + ; We want to do + ; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0)) + ; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0))) + ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0))) + ; = ((f >= 0) ? 
MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0))) + ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0))) + ; So we've reduced the left and right hand terms to be the same, except + ; for a negation. + ; Stall x3 + VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4 + PLD [r12,-r1] + VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3 + PLD [r12] + VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4 + PLD [r12,r1] + VMOVL.U8 Q1, D2 ; Q1 = __UU__QQ__MM__II__EE__AA__66__22 2,3 + PLD [r12,r1,LSL #1] + VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4 + ADD r12,r12,r1, LSL #2 + ; Now we need to correct for the sign of f. + ; For negative elements of Q0, we want to subtract the appropriate + ; element of Q9. For positive elements we want to add them. No NEON + ; instruction exists to do this, so we need to negate the negative + ; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b + VADD.S16 Q9, Q9, Q0 ; 1,3 + PLD [r12,-r1] + VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3 + ; Bah. No VRSBW.U8 + ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.) 
+ VADDW.U8 Q2, Q9, D4 ; Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3 + VSUB.S16 Q1, Q1, Q9 ; Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3 + VQMOVUN.S16 D4, Q2 ; D4 = TTPPLLHHDD995511 1,1 + VQMOVUN.S16 D2, Q1 ; D2 = UUQQMMIIEEAA6622 1,1 + SUB r12,r0, #1 + VTRN.8 D4, D2 ; D4 = QQPPIIHHAA992211 D2 = MMLLEEDD6655 1,1 + VST1.16 {D4[0]}, [r12], r1 + VST1.16 {D2[0]}, [r12], r1 + VST1.16 {D4[1]}, [r12], r1 + VST1.16 {D2[1]}, [r12], r1 + VST1.16 {D4[2]}, [r12], r1 + VST1.16 {D2[2]}, [r12], r1 + VST1.16 {D4[3]}, [r12], r1 + VST1.16 {D2[3]}, [r12], r1 + MOV PC,r14 + ENDP + +loop_filter_v_neon PROC + ; r0 = unsigned char *_pix + ; r1 = int _ystride + ; r2 = int *_bv + ; preserves r0-r3 + ; We assume Q15= 2*L in U16s + ; My best guesses at cycle counts (and latency)--vvv + SUB r12,r0, r1, LSL #1 + VLD1.64 {D0}, [r12@64], r1 ; D0 = SSOOKKGGCC884400 2,1 + VLD1.64 {D2}, [r12@64], r1 ; D2 = TTPPLLHHDD995511 2,1 + VLD1.64 {D4}, [r12@64], r1 ; D4 = UUQQMMIIEEAA6622 2,1 + VLD1.64 {D6}, [r12@64] ; D6 = VVRRNNJJFFBB7733 2,1 + VSUBL.U8 Q8, D4, D2 ; Q8 = 22 - 11 in S16s 1,3 + VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3 + ADD r12, #8 + VADD.S16 Q0, Q0, Q8 ; 1,3 + PLD [r12] + VADD.S16 Q0, Q0, Q8 ; 1,3 + PLD [r12,r1] + VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3 + SUB r12, r0, r1 + VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4 + ; We want to do + ; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0)) + ; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0))) + ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0))) + ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0))) + ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0))) + ; So we've reduced the left and right hand terms to be the same, except + ; for a negation. 
+        ; Stall x3
+        VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4
+        VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3
+        ; Stall x2
+        VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4
+        VMOVL.U8 Q2, D4 ; Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3
+        ; Stall x2
+        VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
+        ; Now we need to correct for the sign of f.
+        ; For negative elements of Q0, we want to subtract the appropriate
+        ; element of Q9. For positive elements we want to add them. No NEON
+        ; instruction exists to do this, so we need to negate the negative
+        ; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+        ; Stall x3
+        VADD.S16 Q9, Q9, Q0 ; 1,3
+        ; Stall x2
+        VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3
+        ; Bah. No VRSBW.U8
+        ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+        VADDW.U8 Q1, Q9, D2 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
+        VSUB.S16 Q2, Q2, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
+        VQMOVUN.S16 D2, Q1 ; D2 = TTPPLLHHDD995511 1,1
+        VQMOVUN.S16 D4, Q2 ; D4 = UUQQMMIIEEAA6622 1,1
+        VST1.64 {D2}, [r12@64], r1
+        VST1.64 {D4}, [r12@64], r1
+        MOV PC,r14
+        ENDP
+
+oc_loop_filter_frag_rows_neon PROC
+        ; Walks the fragments in [_fragi0,_fragi0_end), one row of _nhfrags
+        ;  fragments at a time, and for every fragment that has
+        ;  OC_FRAG_CODED_FLAG set calls:
+        ;   loop_filter_h_neon at the fragment's start address, unless it is the
+        ;    first fragment of its row;
+        ;   loop_filter_v_neon at the fragment's start address, if the row start
+        ;    is greater than _fragi_top;
+        ;   loop_filter_h_neon at start+8, if the next fragment in the row exists
+        ;    and is not coded;
+        ;   loop_filter_v_neon at start+8*_ystride, if the fragment one row down
+        ;    lies before _fragi_bot and is not coded.
+        ; The first four arguments arrive in r0-r3. The other six are read into
+        ;  r4-r9 from the stack as it was on entry (r12 keeps a copy of the entry
+        ;  stack pointer for that purpose).
+        ; r0 = _ref_frame_data
+        ; r1 = _ystride
+        ; r2 = _bv
+        ; r3 = _frags
+        ; r4 = _fragi0
+        ; r5 = _fragi0_end
+        ; r6 = _fragi_top
+        ; r7 = _fragi_bot
+        ; r8 = _frag_buf_offs
+        ; r9 = _nhfrags
+        MOV r12,r13
+        ; r0 is pushed as well so that _ref_frame_data can be reloaded from [r13]
+        ;  on every iteration of the inner loop.
+        STMFD r13!,{r0,r4-r11,r14}
+        LDMFD r12,{r4-r9}
+        CMP r4, r5 ; if(_fragi0>=_fragi0_end)
+        BGE oslffri_neon_end; bail
+        SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
+        ; NOTE(review): this branch is also taken when _nhfrags==1, so a plane
+        ;  that is a single fragment wide gets no filtering at all here; confirm
+        ;  that this is intended.
+        BLE oslffri_neon_end ; bail
+        VLD1.64 {D30,D31}, [r2@128] ; Q15= 2L in U16s
+        ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
+        ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
+        SUB r7, r7, r9 ; r7 = _fragi_bot-(_nhfrags-1); compared with fragi+1 below
+oslffri_neon_lp1
+        MOV r10,r4 ; r10= fragi = _fragi0
+        ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_neon_lp2
+        LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
+        LDR r0, [r13] ; r0 = _ref_frame_data
+        LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
+        TST r14,#OC_FRAG_CODED_FLAG
+        BEQ oslffri_neon_uncoded
+        CMP r10,r4 ; if (fragi>_fragi0)
+        ; The ADD does not set flags, so BLGT still tests the CMP above.
+        ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+        BLGT loop_filter_h_neon
+        CMP r4, r6 ; if (_fragi0>_fragi_top)
+        BLGT loop_filter_v_neon
+        CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+        ; Everything up to the BLLT is either conditional on LT or leaves the
+        ;  flags alone. If the CMP was not LT, the flags reach BLLT unchanged and
+        ;  the call is skipped. If it was LT, CMPLT leaves LT set only when the
+        ;  masked flag is 0, i.e. the neighbour is not coded.
+        LDRLT r12,[r3] ; r12 = _frags[fragi+1]
+        ADD r0, r0, #8
+        ADD r10,r10,#1 ; r10 = fragi+1;
+        ANDLT r12,r12,#OC_FRAG_CODED_FLAG
+        CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
+        BLLT loop_filter_h_neon
+        CMP r10,r7 ; if (fragi<_fragi_bot)
+        ; r3 already points at _frags[fragi+1], so adding (_nhfrags-1) entries
+        ;  addresses the fragment one row down.
+        LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
+        SUB r0, r0, #8
+        ADD r0, r0, r1, LSL #3 ; r0 = fragment start + 8*_ystride
+        ANDLT r12,r12,#OC_FRAG_CODED_FLAG
+        CMPLT r12,#OC_FRAG_CODED_FLAG
+        BLLT loop_filter_v_neon
+        CMP r10,r11 ; while(fragi<=fragi_end-1)
+        BLE oslffri_neon_lp2
+        MOV r4, r10 ; r4 = _fragi0 += _nhfrags
+        CMP r4, r5
+        BLT oslffri_neon_lp1
+oslffri_neon_end
+        LDMFD r13!,{r0,r4-r11,PC}
+oslffri_neon_uncoded
+        ; Uncoded fragment: nothing to filter, just advance and run the same
+        ;  loop-termination tests as the coded path.
+        ADD r10,r10,#1
+        CMP r10,r11
+        BLE oslffri_neon_lp2
+        MOV r4, r10 ; r4 = _fragi0 += _nhfrags
+        CMP r4, r5
+        BLT oslffri_neon_lp1
+        LDMFD r13!,{r0,r4-r11,PC}
+        ENDP
+        ] ; ends a conditional-assembly block; its opening '[' is not in view here
+
+        END
diff --git a/media/libtheora/lib/arm/armopts.s b/media/libtheora/lib/arm/armopts.s
new file mode 100644
index 000000000..e4da429e4
--- /dev/null
+++ b/media/libtheora/lib/arm/armopts.s
@@ -0,0 +1,39 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.
*
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armopts.s.in 17430 2010-09-22 21:54:09Z tterribe $
+;********************************************************************
+; Build options for the ARM assembly. '*' is the armasm synonym for EQU.
+; Set the following to 1 if we have EDSP instructions
+; (LDRD/STRD, etc., ARMv5E and later).
+OC_ARM_ASM_EDSP * 1
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OC_ARM_ASM_MEDIA * 1
+
+; Set the following to 1 if we have NEON (some ARMv7).
+OC_ARM_ASM_NEON * 1
+
+; Set the following to 1 if LDR/STR can work on unaligned addresses.
+; This is assumed to be true for ARMv6 and later code (but is left at 0 here).
+OC_ARM_CAN_UNALIGN * 0
+
+; Large unaligned loads and stores are often configured to cause an exception.
+; They cause an 8 cycle stall when they cross a 128-bit (load) or 64-bit (store)
+; boundary, so it's usually a bad idea to use them anyway if they can be
+; avoided.
+
+; Set the following to 1 if LDRD/STRD can work on unaligned addresses.
+OC_ARM_CAN_UNALIGN_LDRD * 0
+
+        END
diff --git a/media/libtheora/lib/arm/armstate.c b/media/libtheora/lib/arm/armstate.c
new file mode 100644
index 000000000..a56060838
--- /dev/null
+++ b/media/libtheora/lib/arm/armstate.c
@@ -0,0 +1,219 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.
*
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armint.h"
+
+#if defined(OC_ARM_ASM)
+
+# if defined(OC_ARM_ASM_NEON)
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+   the destination.
+  The first 64 entries are a permutation of 0...63; the 64 entries after them
+   all hold the value 64.
+  oc_state_accel_init_arm() installs it as opt_data.dct_fzig_zag when the NEON
+   CPU flag is set.*/
+static const unsigned char OC_FZIG_ZAG_NEON[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3, 4,11,18,25,32,40,
+  33,26,19,12, 5, 6,13,20,
+  27,34,41,48,56,49,42,35,
+  28,21,14, 7,15,22,29,36,
+  43,50,57,58,51,44,37,30,
+  23,31,38,45,52,59,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+# endif
+/*Fills in the acceleration state of _state for ARM.
+  The C defaults are installed first via oc_state_accel_init_c(), then
+   cpu_flags is set from oc_cpu_flags_get().
+  When OC_STATE_USE_VTABLE is defined, the _arm routines are installed
+   unconditionally, and are then overridden in turn by the EDSP, ARMv6 media
+   (_v6) and NEON versions, each only if it was compiled in and the matching
+   OC_CPU_ARM_* bit is set in cpu_flags.
+  Later assignments win, so NEON takes precedence over _v6, which takes
+   precedence over EDSP.
+  Because of the way the conditionals nest, the ARMv6 media section is only
+   compiled when OC_ARM_ASM_EDSP is also defined, and the NEON section only
+   when OC_ARM_ASM_MEDIA is defined as well.*/
+void oc_state_accel_init_arm(oc_theora_state *_state){
+  oc_state_accel_init_c(_state);
+  _state->cpu_flags=oc_cpu_flags_get();
+# if defined(OC_STATE_USE_VTABLE)
+  _state->opt_vtable.frag_copy_list=oc_frag_copy_list_arm;
+  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_arm;
+  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_arm;
+  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_arm;
+  _state->opt_vtable.idct8x8=oc_idct8x8_arm;
+  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_arm;
+  /*Note: We _must_ set this function pointer, because the macro in armint.h
+     calls it with different arguments, so the C version will segfault.*/
+  _state->opt_vtable.state_loop_filter_frag_rows=
+   (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_arm;
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+  if(_state->cpu_flags&OC_CPU_ARM_EDSP){
+#  if defined(OC_STATE_USE_VTABLE)
+    /*EDSP only supplies a faster fragment copy.*/
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_edsp;
+#  endif
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  if(_state->cpu_flags&OC_CPU_ARM_MEDIA){
+#   if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_v6;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_v6;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_v6;
+    _state->opt_vtable.idct8x8=oc_idct8x8_v6;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_v6;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_v6;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_v6;
+#   endif
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  if(_state->cpu_flags&OC_CPU_ARM_NEON){
+#    if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_neon;
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_neon;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_neon;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_neon;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_neon;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_neon;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_neon;
+    _state->opt_vtable.idct8x8=oc_idct8x8_neon;
+#    endif
+    /*This is set whether or not the vtable is in use.*/
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_NEON;
+  }
+#   endif
+#  endif
+# endif
+}
+/*Reconstructs fragment _fragi of plane _pli into the OC_FRAME_SELF reference
+   buffer using the _arm routines.
+  _dct_coeffs: 128 entries.
+               The coefficients are taken from the start of the array, the
+                inverse transform is handed _dct_coeffs+64, and that same
+                pointer is then passed to the oc_frag_recon_*_arm() call.
+  _last_zzi:   Values below 2 select the DC-only path; otherwise the value is
+                passed through to oc_idct8x8_arm().
+  _dc_quant:   The multiplier applied to _dct_coeffs[0] before the transform.
+  The fragment's refi selects intra reconstruction (OC_FRAME_SELF) or
+   prediction from ref_frame_data[refi]; in the latter case the two-reference
+   routine is used when oc_state_get_mv_offsets() returns more than 1.*/
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.
+      Note that + binds tighter than >>, so the 15 is added before the shift.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_arm(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_arm(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_arm(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_arm(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_arm(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+/*ARMv6 media version of oc_state_frag_recon_arm(): the logic is the same, but
+   every transform and reconstruction helper called is the _v6 variant.*/
+# if defined(OC_ARM_ASM_MEDIA)
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_v6(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_v6(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_v6(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_v6(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_v6(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+/*NEON version of oc_state_frag_recon_arm(): the logic is the same, but every
+   transform and reconstruction helper called is the _neon variant.
+  It is only compiled when OC_ARM_ASM_MEDIA is defined as well, because the
+   conditional below is nested inside that one.*/
+#  if defined(OC_ARM_ASM_NEON)
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_neon(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_neon(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_neon(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_neon(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_neon(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+#  endif
+# endif
+
+#endif |