[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[OpenDivX] OpenDivX Decoder ->OpenDivX Forums Digest Part 2 Lots of Code
OpenDivX Forums Digest Part 2 Lots of Code
Topic: Faster code
Author: mcmab
Posted: 2001-02-22 12:40
------------------------------------------------------------------------
--------
I have actually recoded all of basic_prediction.c (the MMX version) and
I won't bore you all with it. However this routine demonstrates just how
much these routines can be improved.
In:
void CopyMBlockHorVer(unsigned char * Src, unsigned char * Dst, int
Stride)
CODE WAS:
/* OLD ASSEMBLER COMMENTED OUT
; Original scalar version of CopyMBlockHorVer, quoted for reference only.
; For each of 16 rows and each of 16 columns x it stores
;   Dst[x] = (Src[x] + Src[x+1] + Src[x+Stride] + Src[x+Stride+1] + 2) >> 2
; Register roles: esi = source row, edi = destination row, ebx = Stride,
; edx = rows remaining, eax = running sum, ecx = byte scratch (via cl).
; Each pixel costs four byte loads, i.e. 64 loads per row.
xor eax, eax ; clear eax so that loading al yields a clean 32-bit value
mov ebx, Stride
xor ecx, ecx ; only cl is written below, so ecx stays zero-extended
mov edx, 16 ; row counter
mov esi, dword ptr [Src]
mov edi, dword ptr [Dst]
start_again1:
// 0
//xor eax, eax
; Re-clearing eax is not needed: the previous result is at most
; (4*255 + 2) >> 2 = 255, so only al is non-zero and the next load replaces it.
mov al, [esi] ; top-left sample
mov cl, [esi+1] ; top-right sample
add eax, ecx
mov cl, [esi+ebx] ; bottom-left sample (one row down)
add eax, ecx
mov cl, [esi+ebx+1] ; bottom-right sample
add eax, ecx
add eax, 2 ; rounding term
shr eax, 2 ; divide the four-sample sum by 4
mov [edi], al
// 1
//xor eax, eax
mov al, [esi+1]
mov cl, [esi+2]
add eax, ecx
mov cl, [esi+ebx+1]
add eax, ecx
mov cl, [esi+ebx+2]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+1], al
// 2
mov al, [esi+2]
mov cl, [esi+3]
add eax, ecx
mov cl, [esi+ebx+2]
add eax, ecx
mov cl, [esi+ebx+3]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+2], al
// 3
mov al, [esi+3]
mov cl, [esi+4]
add eax, ecx
mov cl, [esi+ebx+3]
add eax, ecx
mov cl, [esi+ebx+4]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+3], al
// 4
//xor eax, eax
mov al, [esi+4]
mov cl, [esi+5]
add eax, ecx
mov cl, [esi+ebx+4]
add eax, ecx
mov cl, [esi+ebx+5]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+4], al
// 5
//xor eax, eax
mov al, [esi+5]
mov cl, [esi+6]
add eax, ecx
mov cl, [esi+ebx+5]
add eax, ecx
mov cl, [esi+ebx+6]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+5], al
// 6
mov al, [esi+6]
mov cl, [esi+7]
add eax, ecx
mov cl, [esi+ebx+6]
add eax, ecx
mov cl, [esi+ebx+7]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+6], al
// 7
mov al, [esi+7]
mov cl, [esi+8]
add eax, ecx
mov cl, [esi+ebx+7]
add eax, ecx
mov cl, [esi+ebx+8]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+7], al
// 8
mov al, [esi+8]
mov cl, [esi+9]
add eax, ecx
mov cl, [esi+ebx+8]
add eax, ecx
mov cl, [esi+ebx+9]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+8], al
// 9
mov al, [esi+9]
mov cl, [esi+10]
add eax, ecx
mov cl, [esi+ebx+9]
add eax, ecx
mov cl, [esi+ebx+10]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+9], al
// 10
mov al, [esi+10]
mov cl, [esi+11]
add eax, ecx
mov cl, [esi+ebx+10]
add eax, ecx
mov cl, [esi+ebx+11]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+10], al
// 11
mov al, [esi+11]
mov cl, [esi+12]
add eax, ecx
mov cl, [esi+ebx+11]
add eax, ecx
mov cl, [esi+ebx+12]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+11], al
//12
mov al, [esi+12]
mov cl, [esi+13]
add eax, ecx
mov cl, [esi+ebx+12]
add eax, ecx
mov cl, [esi+ebx+13]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+12], al
//13
//xor eax, eax
mov al, [esi+13]
mov cl, [esi+14]
add eax, ecx
mov cl, [esi+ebx+13]
add eax, ecx
mov cl, [esi+ebx+14]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+13], al
//14
mov al, [esi+14]
mov cl, [esi+15]
add eax, ecx
mov cl, [esi+ebx+14]
add eax, ecx
mov cl, [esi+ebx+15]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+14], al
//15
mov al, [esi+15]
mov cl, [esi+16] ; column 15 reads one byte past the 16-byte row
add eax, ecx
mov cl, [esi+ebx+15]
add eax, ecx
mov cl, [esi+ebx+16]
add eax, ecx
add eax, 2
shr eax, 2
mov [edi+15], al
add esi, ebx ; next source row
add edi, ebx ; next destination row (same stride as the source)
dec edx
jnz start_again1 ; loop until all 16 rows are written
*/ END OF OLD ASSEMBLER COMMENTED OUT
Note that the preceding code uses NO MMX even though it's in the MMX
directory.
NEW CODE:
I include this as it contains some MMX tricks others might find useful:
// CopyMBlockHorVer, MMX version: 16 rows of 16 pixels, each computed as
//   Dst[x] = (Src[x] + Src[x+1] + Src[x+Stride] + Src[x+Stride+1] + 2) >> 2
//
// Although this routine is quite lengthy and involves a lot of reg->reg
// moves, it has the enormous advantage of minimizing memory reads, and it
// maintains values across iterations of the loop. The original does
// 16x4 = 64 reads per iteration. This does *four*! For 16 iterations
// it's 64 vs 1024.
//
// The approach is to take the original 16-byte/iteration loop and do it
// in two 8-byte MMX chunks, at eax and eax+8, for each iteration.
// Since the totals need 16 bits, each chunk is accumulated in two MMX
// registers (low four and high four pixels): "the accumulators".
//
// Note the pairs of instructions, typically
//     punpcklbw mmX, mm7
//     punpckhbw mmX, mm7
// mm7 is zero throughout, so these pairs just promote bytes -> words
// with zero extension.
//
// Registers: eax = source row, edx = destination row, ebx = Stride,
//            ecx = rows remaining.
// NOTE: this fragment executes no emms; the MMX state has to be cleared
// elsewhere before any x87 floating-point code runs.
        mov     eax, dword ptr [Src]
        mov     ebx, Stride
        mov     edx, dword ptr [Dst]
        mov     ecx, 16                 ; counter for the *dy* (row) loop
        pxor    mm7, mm7                ; zero register
        movq    mm6, qword ptr [eax]
        movq    mm5, qword ptr [eax+1]
        movq    mm4, qword ptr [eax+8]
        movq    mm3, qword ptr [eax+9]
// At the top of every loop iteration the following holds:
//   mm7 = 0
//   mm6 = [eax]
//   mm5 = [eax+1]
//   mm4 = [eax+8]
//   mm3 = [eax+9]
loop_CMBHVR:
//********* First the set at eax
        movq    mm1, mm6                ; 8 bytes of source
        movq    mm0, mm6                ; these two moves run in parallel
        punpcklbw mm0, mm7              ; low 4 bytes -> words; mm0/mm1 accumulate
        punpckhbw mm1, mm7              ; high 4 bytes -> words
        movq    mm6, mm5                ; and repeat for Src[dx+1]
        punpcklbw mm5, mm7              ; low 4 bytes of [eax+1] -> words
        punpckhbw mm6, mm7              ; high 4 bytes of [eax+1] -> words
        paddusw mm0, mm5                ; low sum so far: [eax] + [eax+1]
        paddusw mm1, mm6                ; high sum so far; mm2, mm5, mm6 now free
//********* Now the second set at eax+8
        movq    mm6, mm3
        punpcklbw mm3, mm7              ; low 4 bytes of [eax+9] -> words
        punpckhbw mm6, mm7              ; high 4 bytes of [eax+9] -> words
        movq    mm5, mm4                ; mm5 is free, use it for parallelism
        punpcklbw mm4, mm7              ; low 4 bytes of [eax+8] -> words
        punpckhbw mm5, mm7              ; high 4 bytes of [eax+8] -> words
        paddusw mm3, mm4                ; mm3 = second-set low accumulator
        paddusw mm5, mm6                ; mm5 = second-set high accumulator
                                        ; mm2, mm4, mm6 are now free
        add     eax, ebx                ; eax -> next source row
//********* First set again, using the lower row
        movq    mm6, qword ptr [eax]    ; doubles as [eax] for the next iteration
        movq    mm2, mm6                ; so mm6 itself must stay untouched
        movq    mm4, mm6
        punpcklbw mm2, mm7              ; low 4 bytes -> words
        punpckhbw mm4, mm7              ; high 4 bytes -> words
        paddusw mm0, mm2
        paddusw mm1, mm4                ; mm2 and mm4 are scratch again
        movq    mm4, qword ptr [eax+1]  ; kept raw: becomes mm5 for the next iteration
        movq    mm2, mm4
        punpcklbw mm2, mm7              ; low 4 bytes -> words
        paddusw mm0, mm2
        movq    mm2, mm4
        punpckhbw mm2, mm7              ; high 4 bytes -> words
        paddusw mm1, mm2                ; mm0/mm1 hold the totals; now add 2
        pcmpeqd mm2, mm2                ; all bits set: every word is -1 (a trick)
        psubsw  mm0, mm2                ; +1 (subtracting -1)
        psubsw  mm1, mm2                ; +1
        psubsw  mm0, mm2                ; +2
        psubsw  mm1, mm2                ; +2
        psrlw   mm0, 2                  ; the algorithm's >>2
        psrlw   mm1, 2
        packuswb mm0, mm1               ; 8 words -> 8 bytes; mm1 is free
        movq    qword ptr [edx], mm0    ; write pixels 0..7; mm0, mm1 are free
        movq    mm1, mm5                ; register twiddling: second-set high accumulator
        movq    mm5, mm4                ; restore [eax+1] to its rightful position
//********* Second set again, using the lower row
        movq    mm4, qword ptr [eax+8]  ; doubles as [eax+8] for the next iteration
        movq    mm0, mm4
        punpckhbw mm0, mm7              ; high 4 bytes -> words
        paddusw mm1, mm0                ; add to the high accumulator
        movq    mm0, mm4                ; fetch the [eax+8] value again
        punpcklbw mm0, mm7              ; low 4 bytes -> words
        paddusw mm0, mm3                ; mm0 = low accumulator (register roles swapped)
                                        ; mm4, mm5, mm6 now hold next-iteration values,
                                        ; mm0/mm1 accumulate, mm7 = 0, mm2 = -1 per word
        movq    mm3, qword ptr [eax+9]  ; last load of the iteration, issued early
        psubsw  mm0, mm2                ; +1, while mm2 still holds -1
        psubsw  mm1, mm2                ; +1
        psubsw  mm0, mm2                ; +2
        psubsw  mm1, mm2                ; +2
        movq    mm2, mm3                ; register twiddling: mm2 is scratch from here
        punpcklbw mm2, mm7              ; low 4 bytes of [eax+9] -> words
        paddusw mm0, mm2                ; add to the low accumulator
        movq    mm2, mm3                ; get that value back
        psrlw   mm0, 2                  ; the algorithm's >>2
        punpckhbw mm2, mm7              ; high 4 bytes of [eax+9] -> words
        paddusw mm1, mm2                ; add to the high accumulator
        psrlw   mm1, 2                  ; the algorithm's >>2
        packuswb mm0, mm1               ; phew, register pressure: 8x16 bits -> 8x8 bits
        movq    qword ptr [edx+8], mm0  ; write pixels 8..15
        add     edx, ebx                ; next destination row
        dec     ecx
        jnz     loop_CMBHVR
Topic: Faster code
Author: mcmab
Posted: 2001-02-22 12:34
------------------------------------------------------------------------
--------
OK, here's some optimizations for basic_prediction.asm (untested - still
waiting for VC++)
In:
void CopyBlockHor(unsigned char * Src, unsigned char * Dst, int Stride)
CODE WAS:
/* BEGIN OLD ASSEMBLER COMMENTED OUT
; Original scalar loop body of CopyBlockHor, quoted for reference only.
; Each pass of the loop handles one row of 8 pixels:
;   Dst[x] = (Src[x] + Src[x+1] + 1) >> 1
; esi walks the source row and edi the destination row, one byte per pixel.
; NOTE(review): the register setup is not part of this excerpt. The code
; relies on the upper bits of eax and ecx being zero, on edx holding the
; row count, and on ebx holding the per-row pointer adjustment - confirm
; against the full routine.
start_again10:
// 0
mov al, [esi] ; left sample
inc esi
mov cl, [esi] ; right neighbour
add eax, ecx
inc eax ; rounding term
shr eax, 1 ; average of the two samples
mov [edi], al
inc edi
// 1
mov al, [esi]
inc esi
mov cl, [esi]
add eax, ecx
inc eax
shr eax, 1
mov [edi], al
inc edi
// 2
mov al, [esi]
inc esi
mov cl, [esi]
add eax, ecx
inc eax
shr eax, 1
mov [edi], al
inc edi
// 3
mov al, [esi]
inc esi
mov cl, [esi]
add eax, ecx
inc eax
shr eax, 1
mov [edi], al
inc edi
// 4
mov al, [esi]
inc esi
mov cl, [esi]
add eax, ecx
inc eax
shr eax, 1
mov [edi], al
inc edi
// 5
mov al, [esi]
inc esi
mov cl, [esi]
add eax, ecx
inc eax
shr eax, 1
mov [edi], al
inc edi
// 6
mov al, [esi]
inc esi
mov cl, [esi]
add eax, ecx
inc eax
shr eax, 1
mov [edi], al
inc edi
// 7
; The last pixel does not advance esi/edi, so both are 7 bytes past the
; start of the row when ebx is added below.
mov al, [esi]
mov cl, [esi+1]
add eax, ecx
inc eax
shr eax, 1
mov [edi], al
add esi, ebx ; NOTE(review): reaches the next row start only if ebx = Stride - 7
add edi, ebx
dec edx
jnz start_again10
*/ OLD ASSEMBLER COMMENTED OUT
NEW CODE, NO LOOP, USES 3DNow
// CopyBlockHor, 3DNow! version: 8 rows of 8 pixels, fully unrolled.
//   Dst[x] = (Src[x] + Src[x+1] + 1) >> 1, done 8 bytes at a time by pavgusb
//   (rounded unsigned byte average).
//
// Note that ESI & EDI are NOT used and therefore need not be pushed &
// popped in the prolog/epilog.
// Stride is coded_picture_width and is quite large (>64), defined again
// in mp2_recon.c, so there is no point in grouping writes.
//
// Registers: eax = source, edx = destination, ebx = Stride, ecx = 3*Stride.
// Rows 0..3 go through mm0..mm3, rows 4..7 through mm4..mm7.
        mov     eax, dword ptr [Src]            ; these loads should be accessed off ESP, not EBP
        mov     ebx, Stride                     ; no need to push/pop EBP either - CHECK ASSEMBLER LISTING
        mov     edx, dword ptr [Dst]
        movq    mm0, qword ptr [eax]            ; row 0
        movq    mm1, qword ptr [eax+ebx]        ; row 1; avoids a dependency on mm0 if possible
        pavgusb mm0, qword ptr [eax+1]
        lea     ecx, [ebx+ebx*2]                ; ecx = 3*Stride
        pavgusb mm1, qword ptr [eax+ebx+1]      ; 8x8-byte routine; mm0-mm7 are all used, for clarity
        movq    mm2, qword ptr [eax+ebx*2]      ; row 2
        movq    qword ptr [edx], mm0
        movq    mm3, qword ptr [eax+ecx]        ; row 3
        pavgusb mm2, qword ptr [eax+ebx*2+1]
        movq    qword ptr [edx+ebx], mm1
        pavgusb mm3, qword ptr [eax+ecx+1]      ; well, at least pavgusb is pipelined
        movq    qword ptr [edx+ebx*2], mm2
        lea     eax, [eax+ebx*4]                ; advance source 4 rows and repeat the exercise
        movq    qword ptr [edx+ecx], mm3        ; 32 bytes written
        lea     edx, [edx+ebx*4]                ; advance destination 4 rows
        movq    mm4, qword ptr [eax]            ; row 4
        movq    mm5, qword ptr [eax+ebx]        ; row 5
        pavgusb mm4, qword ptr [eax+1]
        movq    mm6, qword ptr [eax+ebx*2]      ; row 6
        pavgusb mm5, qword ptr [eax+ebx+1]
        movq    qword ptr [edx], mm4
        movq    mm7, qword ptr [eax+ecx]        ; row 7 (memo: ecx = 3*Stride)
        movq    qword ptr [edx+ebx], mm5
        pavgusb mm6, qword ptr [eax+ebx*2+1]
        pavgusb mm7, qword ptr [eax+ecx+1]
        movq    qword ptr [edx+ebx*2], mm6
        movq    qword ptr [edx+ecx], mm7        ; 64 bytes written
// A femms is still needed somewhere before x87 code runs; suggest placing
// it higher up in the call chain rather than in this routine.
Topic: Faster code
Author: mcmab
Posted: 2001-02-22 13:03
------------------------------------------------------------------------
--------
In the preceding post, the COMMENTS here:
// On entry to loop a number of registers are defined
// mm7 = 0
// mm6 = [eax]
// mm5 = [eax+1]
// mm4 = [eax+8]
// mm3 = [eax+9]
are wrong, copied from another routine. It should say mm4 =
[eax+Stride], mm3 = [eax+Stride+1]
Sorry 'bout that.
Topic: Faster code
Author: mcmab
Posted: 2001-02-22 13:08
------------------------------------------------------------------------
--------
Aargh, screw up. Ignore last post about comments - they were right.
Topic: How to feed DECORE.... I am lost!
Author: e7abe7a
Posted: 2001-02-22 18:08
------------------------------------------------------------------------
--------
There's a new variable in the DEC_PARAM structure. This field has been
added to support more output formats (different YUV modes), not only
RGB.
You shouldn't change anything in your application: the previous API call
will continue to work.
The problem you mentioned... I try to guess... could be due to the YUV
to RGB conversion routine. Which color depth are you using?[ This
message was edited by: e7abe7a on 2001-02-22 18:10 ]
Topic: Bug in HiColor
Author: e7abe7a
Posted: 2001-02-22 18:14
------------------------------------------------------------------------
--------
There are now two different 16 bit conversion routines. One is using 555
(first red bit bit zero), the other one is using 565 (first green bit
zero).
Try the new release with your graphics card (filter and dll) and give us
some feedback.
PS.: what graphic card are you using?
Topic: Faster code
Author: eagle
Posted: 2001-02-22 18:23
------------------------------------------------------------------------
--------
Wow, that's a lot of good stuff! What sort of speed improvement do
these changes get us? Prediction is quite a CPU-hungry routine so a
good improvement here would be a good improvement overall.
When we know which is the fastest, we'll drop it into the decore tree.
eagle
_______________________________________________
OpenDivX mailing list
OpenDivX@lists.projectmayo.com
http://lists.projectmayo.com/mailman/listinfo/opendivx
Reply To Poster
Local References / HOW-TO / FAQs