diff options
Diffstat (limited to 'other-licenses/7zstub/src/Asm')
-rw-r--r-- | other-licenses/7zstub/src/Asm/arm/7zCrcOpt.asm | 100 | ||||
-rw-r--r-- | other-licenses/7zstub/src/Asm/x86/7zAsm.asm | 147 | ||||
-rw-r--r-- | other-licenses/7zstub/src/Asm/x86/7zCrcOpt.asm | 147 | ||||
-rw-r--r-- | other-licenses/7zstub/src/Asm/x86/AesOpt.asm | 237 | ||||
-rw-r--r-- | other-licenses/7zstub/src/Asm/x86/LzmaDecOpt.asm | 1258 | ||||
-rw-r--r-- | other-licenses/7zstub/src/Asm/x86/XzCrc64Opt.asm | 205 |
6 files changed, 2094 insertions, 0 deletions
diff --git a/other-licenses/7zstub/src/Asm/arm/7zCrcOpt.asm b/other-licenses/7zstub/src/Asm/arm/7zCrcOpt.asm new file mode 100644 index 000000000..f008d658c --- /dev/null +++ b/other-licenses/7zstub/src/Asm/arm/7zCrcOpt.asm @@ -0,0 +1,100 @@ + CODE32
+
+ EXPORT |CrcUpdateT4@16|
+
+ AREA |.text|, CODE, ARM
+
+ MACRO
+ CRC32_STEP_1
+
+ ldrb r4, [r1], #1
+ subs r2, r2, #1
+ eor r4, r4, r0
+ and r4, r4, #0xFF
+ ldr r4, [r3, +r4, lsl #2]
+ eor r0, r4, r0, lsr #8
+
+ MEND
+
+
+ MACRO
+ CRC32_STEP_4 $STREAM_WORD
+
+ eor r7, r7, r8
+ eor r7, r7, r9
+ eor r0, r0, r7
+ eor r0, r0, $STREAM_WORD
+ ldr $STREAM_WORD, [r1], #4
+
+ and r7, r0, #0xFF
+ and r8, r0, #0xFF00
+ and r9, r0, #0xFF0000
+ and r0, r0, #0xFF000000
+
+ ldr r7, [r6, +r7, lsl #2]
+ ldr r8, [r5, +r8, lsr #6]
+ ldr r9, [r4, +r9, lsr #14]
+ ldr r0, [r3, +r0, lsr #22]
+
+ MEND
+
+
+|CrcUpdateT4@16| PROC
+
+ stmdb sp!, {r4-r11, lr}
+ cmp r2, #0
+ beq |$fin|
+
+|$v1|
+ tst r1, #7
+ beq |$v2|
+ CRC32_STEP_1
+ bne |$v1|
+
+|$v2|
+ cmp r2, #16
+ blo |$v3|
+
+ ldr r10, [r1], #4
+ ldr r11, [r1], #4
+
+ add r4, r3, #0x400
+ add r5, r3, #0x800
+ add r6, r3, #0xC00
+
+ mov r7, #0
+ mov r8, #0
+ mov r9, #0
+
+ sub r2, r2, #16
+
+|$loop|
+ ; pld [r1, #0x40]
+
+ CRC32_STEP_4 r10
+ CRC32_STEP_4 r11
+
+ subs r2, r2, #8
+ bhs |$loop|
+
+ sub r1, r1, #8
+ add r2, r2, #16
+
+ eor r7, r7, r8
+ eor r7, r7, r9
+ eor r0, r0, r7
+
+|$v3|
+ cmp r2, #0
+ beq |$fin|
+
+|$v4|
+ CRC32_STEP_1
+ bne |$v4|
+
+|$fin|
+ ldmia sp!, {r4-r11, pc}
+
+|CrcUpdateT4@16| ENDP
+
+ END
diff --git a/other-licenses/7zstub/src/Asm/x86/7zAsm.asm b/other-licenses/7zstub/src/Asm/x86/7zAsm.asm new file mode 100644 index 000000000..8c30d7b7e --- /dev/null +++ b/other-licenses/7zstub/src/Asm/x86/7zAsm.asm @@ -0,0 +1,147 @@ +; 7zAsm.asm -- ASM macros
+; 2018-02-03 : Igor Pavlov : Public domain
+
+MY_ASM_START macro
+ ifdef x64
+ .code
+ else
+ .386
+ .model flat
+ _TEXT$00 SEGMENT PARA PUBLIC 'CODE'
+ endif
+endm
+
+MY_PROC macro name:req, numParams:req
+ align 16
+ proc_numParams = numParams
+ ifdef x64
+ proc_name equ name
+ else
+ proc_name equ @CatStr(@,name,@, %numParams * 4)
+ endif
+ proc_name PROC
+endm
+
+MY_ENDP macro
+ ifdef x64
+ ret
+ else
+ if proc_numParams LT 3
+ ret
+ else
+ ret (proc_numParams - 2) * 4
+ endif
+ endif
+ proc_name ENDP
+endm
+
+ifdef x64
+ REG_SIZE equ 8
+ REG_LOGAR_SIZE equ 3
+else
+ REG_SIZE equ 4
+ REG_LOGAR_SIZE equ 2
+endif
+
+ x0 equ EAX
+ x1 equ ECX
+ x2 equ EDX
+ x3 equ EBX
+ x4 equ ESP
+ x5 equ EBP
+ x6 equ ESI
+ x7 equ EDI
+
+ x0_W equ AX
+ x1_W equ CX
+ x2_W equ DX
+ x3_W equ BX
+
+ x5_W equ BP
+ x6_W equ SI
+ x7_W equ DI
+
+ x0_L equ AL
+ x1_L equ CL
+ x2_L equ DL
+ x3_L equ BL
+
+ x0_H equ AH
+ x1_H equ CH
+ x2_H equ DH
+ x3_H equ BH
+
+ifdef x64
+ x5_L equ BPL
+ x6_L equ SIL
+ x7_L equ DIL
+
+ r0 equ RAX
+ r1 equ RCX
+ r2 equ RDX
+ r3 equ RBX
+ r4 equ RSP
+ r5 equ RBP
+ r6 equ RSI
+ r7 equ RDI
+ x8 equ r8d
+ x9 equ r9d
+ x10 equ r10d
+ x11 equ r11d
+ x12 equ r12d
+ x13 equ r13d
+ x14 equ r14d
+ x15 equ r15d
+else
+ r0 equ x0
+ r1 equ x1
+ r2 equ x2
+ r3 equ x3
+ r4 equ x4
+ r5 equ x5
+ r6 equ x6
+ r7 equ x7
+endif
+
+MY_PUSH_4_REGS macro
+ push r3
+ push r5
+ push r6
+ push r7
+endm
+
+MY_POP_4_REGS macro
+ pop r7
+ pop r6
+ pop r5
+ pop r3
+endm
+
+
+ifdef x64
+
+; for WIN64-x64 ABI:
+
+REG_PARAM_0 equ r1
+REG_PARAM_1 equ r2
+REG_PARAM_2 equ r8
+REG_PARAM_3 equ r9
+
+MY_PUSH_PRESERVED_REGS macro
+ MY_PUSH_4_REGS
+ push r12
+ push r13
+ push r14
+ push r15
+endm
+
+
+MY_POP_PRESERVED_REGS macro
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ MY_POP_4_REGS
+endm
+
+endif
diff --git a/other-licenses/7zstub/src/Asm/x86/7zCrcOpt.asm b/other-licenses/7zstub/src/Asm/x86/7zCrcOpt.asm new file mode 100644 index 000000000..2de51719a --- /dev/null +++ b/other-licenses/7zstub/src/Asm/x86/7zCrcOpt.asm @@ -0,0 +1,147 @@ +; 7zCrcOpt.asm -- CRC32 calculation : optimized version
+; 2009-12-12 : Igor Pavlov : Public domain
+
+include 7zAsm.asm
+
+MY_ASM_START
+
+rD equ r2
+rN equ r7
+
+ifdef x64
+ num_VAR equ r8
+ table_VAR equ r9
+else
+ data_size equ (REG_SIZE * 5)
+ crc_table equ (REG_SIZE + data_size)
+ num_VAR equ [r4 + data_size]
+ table_VAR equ [r4 + crc_table]
+endif
+
+SRCDAT equ rN + rD + 4 *
+
+CRC macro op:req, dest:req, src:req, t:req
+ op dest, DWORD PTR [r5 + src * 4 + 0400h * t]
+endm
+
+CRC_XOR macro dest:req, src:req, t:req
+ CRC xor, dest, src, t
+endm
+
+CRC_MOV macro dest:req, src:req, t:req
+ CRC mov, dest, src, t
+endm
+
+CRC1b macro
+ movzx x6, BYTE PTR [rD]
+ inc rD
+ movzx x3, x0_L
+ xor x6, x3
+ shr x0, 8
+ CRC xor, x0, r6, 0
+ dec rN
+endm
+
+MY_PROLOG macro crc_end:req
+ MY_PUSH_4_REGS
+
+ mov x0, x1
+ mov rN, num_VAR
+ mov r5, table_VAR
+ test rN, rN
+ jz crc_end
+ @@:
+ test rD, 7
+ jz @F
+ CRC1b
+ jnz @B
+ @@:
+ cmp rN, 16
+ jb crc_end
+ add rN, rD
+ mov num_VAR, rN
+ sub rN, 8
+ and rN, NOT 7
+ sub rD, rN
+ xor x0, [SRCDAT 0]
+endm
+
+MY_EPILOG macro crc_end:req
+ xor x0, [SRCDAT 0]
+ mov rD, rN
+ mov rN, num_VAR
+ sub rN, rD
+ crc_end:
+ test rN, rN
+ jz @F
+ CRC1b
+ jmp crc_end
+ @@:
+ MY_POP_4_REGS
+endm
+
+MY_PROC CrcUpdateT8, 4
+ MY_PROLOG crc_end_8
+ mov x1, [SRCDAT 1]
+ align 16
+ main_loop_8:
+ mov x6, [SRCDAT 2]
+ movzx x3, x1_L
+ CRC_XOR x6, r3, 3
+ movzx x3, x1_H
+ CRC_XOR x6, r3, 2
+ shr x1, 16
+ movzx x3, x1_L
+ movzx x1, x1_H
+ CRC_XOR x6, r3, 1
+ movzx x3, x0_L
+ CRC_XOR x6, r1, 0
+
+ mov x1, [SRCDAT 3]
+ CRC_XOR x6, r3, 7
+ movzx x3, x0_H
+ shr x0, 16
+ CRC_XOR x6, r3, 6
+ movzx x3, x0_L
+ CRC_XOR x6, r3, 5
+ movzx x3, x0_H
+ CRC_MOV x0, r3, 4
+ xor x0, x6
+ add rD, 8
+ jnz main_loop_8
+
+ MY_EPILOG crc_end_8
+MY_ENDP
+
+MY_PROC CrcUpdateT4, 4
+ MY_PROLOG crc_end_4
+ align 16
+ main_loop_4:
+ movzx x1, x0_L
+ movzx x3, x0_H
+ shr x0, 16
+ movzx x6, x0_H
+ and x0, 0FFh
+ CRC_MOV x1, r1, 3
+ xor x1, [SRCDAT 1]
+ CRC_XOR x1, r3, 2
+ CRC_XOR x1, r6, 0
+ CRC_XOR x1, r0, 1
+
+ movzx x0, x1_L
+ movzx x3, x1_H
+ shr x1, 16
+ movzx x6, x1_H
+ and x1, 0FFh
+ CRC_MOV x0, r0, 3
+ xor x0, [SRCDAT 2]
+ CRC_XOR x0, r3, 2
+ CRC_XOR x0, r6, 0
+ CRC_XOR x0, r1, 1
+ add rD, 8
+ jnz main_loop_4
+
+ MY_EPILOG crc_end_4
+MY_ENDP
+
+end
diff --git a/other-licenses/7zstub/src/Asm/x86/AesOpt.asm b/other-licenses/7zstub/src/Asm/x86/AesOpt.asm new file mode 100644 index 000000000..c32e48f88 --- /dev/null +++ b/other-licenses/7zstub/src/Asm/x86/AesOpt.asm @@ -0,0 +1,237 @@ +; AesOpt.asm -- Intel's AES.
+; 2009-12-12 : Igor Pavlov : Public domain
+
+include 7zAsm.asm
+
+MY_ASM_START
+
+ifndef x64
+ .xmm
+endif
+
+ifdef x64
+ num equ r8
+else
+ num equ [r4 + REG_SIZE * 4]
+endif
+
+rD equ r2
+rN equ r0
+
+MY_PROLOG macro reg:req
+ ifdef x64
+ movdqa [r4 + 8], xmm6
+ movdqa [r4 + 8 + 16], xmm7
+ endif
+
+ push r3
+ push r5
+ push r6
+
+ mov rN, num
+ mov x6, [r1 + 16]
+ shl x6, 5
+
+ movdqa reg, [r1]
+ add r1, 32
+endm
+
+MY_EPILOG macro
+ pop r6
+ pop r5
+ pop r3
+
+ ifdef x64
+ movdqa xmm6, [r4 + 8]
+ movdqa xmm7, [r4 + 8 + 16]
+ endif
+
+ MY_ENDP
+endm
+
+ways equ 4
+ways16 equ (ways * 16)
+
+OP_W macro op, op2
+ i = 0
+ rept ways
+ op @CatStr(xmm,%i), op2
+ i = i + 1
+ endm
+endm
+
+LOAD_OP macro op:req, offs:req
+ op xmm0, [r1 + r3 offs]
+endm
+
+LOAD_OP_W macro op:req, offs:req
+ movdqa xmm7, [r1 + r3 offs]
+ OP_W op, xmm7
+endm
+
+
+; ---------- AES-CBC Decode ----------
+
+CBC_DEC_UPDATE macro reg, offs
+ pxor reg, xmm6
+ movdqa xmm6, [rD + offs]
+ movdqa [rD + offs], reg
+endm
+
+DECODE macro op:req
+ op aesdec, +16
+ @@:
+ op aesdec, +0
+ op aesdec, -16
+ sub x3, 32
+ jnz @B
+ op aesdeclast, +0
+endm
+
+MY_PROC AesCbc_Decode_Intel, 3
+ MY_PROLOG xmm6
+
+ sub x6, 32
+
+ jmp check2
+
+ align 16
+ nextBlocks2:
+ mov x3, x6
+ OP_W movdqa, [rD + i * 16]
+ LOAD_OP_W pxor, +32
+ DECODE LOAD_OP_W
+ OP_W CBC_DEC_UPDATE, i * 16
+ add rD, ways16
+ check2:
+ sub rN, ways
+ jnc nextBlocks2
+
+ add rN, ways
+ jmp check
+
+ nextBlock:
+ mov x3, x6
+ movdqa xmm1, [rD]
+ LOAD_OP movdqa, +32
+ pxor xmm0, xmm1
+ DECODE LOAD_OP
+ pxor xmm0, xmm6
+ movdqa [rD], xmm0
+ movdqa xmm6, xmm1
+ add rD, 16
+ check:
+ sub rN, 1
+ jnc nextBlock
+
+ movdqa [r1 - 32], xmm6
+ MY_EPILOG
+
+
+; ---------- AES-CBC Encode ----------
+
+ENCODE macro op:req
+ op aesenc, -16
+ @@:
+ op aesenc, +0
+ op aesenc, +16
+ add r3, 32
+ jnz @B
+ op aesenclast, +0
+endm
+
+MY_PROC AesCbc_Encode_Intel, 3
+ MY_PROLOG xmm0
+
+ add r1, r6
+ neg r6
+ add r6, 32
+
+ jmp check_e
+
+ align 16
+ nextBlock_e:
+ mov r3, r6
+ pxor xmm0, [rD]
+ pxor xmm0, [r1 + r3 - 32]
+ ENCODE LOAD_OP
+ movdqa [rD], xmm0
+ add rD, 16
+ check_e:
+ sub rN, 1
+ jnc nextBlock_e
+
+ movdqa [r1 + r6 - 64], xmm0
+ MY_EPILOG
+
+
+; ---------- AES-CTR ----------
+
+XOR_UPD_1 macro reg, offs
+ pxor reg, [rD + offs]
+endm
+
+XOR_UPD_2 macro reg, offs
+ movdqa [rD + offs], reg
+endm
+
+MY_PROC AesCtr_Code_Intel, 3
+ MY_PROLOG xmm6
+
+ mov r5, r4
+ shr r5, 4
+ dec r5
+ shl r5, 4
+
+ mov DWORD PTR [r5], 1
+ mov DWORD PTR [r5 + 4], 0
+ mov DWORD PTR [r5 + 8], 0
+ mov DWORD PTR [r5 + 12], 0
+
+ add r1, r6
+ neg r6
+ add r6, 32
+
+ jmp check2_c
+
+ align 16
+ nextBlocks2_c:
+ movdqa xmm7, [r5]
+
+ i = 0
+ rept ways
+ paddq xmm6, xmm7
+ movdqa @CatStr(xmm,%i), xmm6
+ i = i + 1
+ endm
+
+ mov r3, r6
+ LOAD_OP_W pxor, -32
+ ENCODE LOAD_OP_W
+ OP_W XOR_UPD_1, i * 16
+ OP_W XOR_UPD_2, i * 16
+ add rD, ways16
+ check2_c:
+ sub rN, ways
+ jnc nextBlocks2_c
+
+ add rN, ways
+ jmp check_c
+
+ nextBlock_c:
+ paddq xmm6, [r5]
+ mov r3, r6
+ movdqa xmm0, [r1 + r3 - 32]
+ pxor xmm0, xmm6
+ ENCODE LOAD_OP
+ XOR_UPD_1 xmm0, 0
+ XOR_UPD_2 xmm0, 0
+ add rD, 16
+ check_c:
+ sub rN, 1
+ jnc nextBlock_c
+
+ movdqa [r1 + r6 - 64], xmm6
+ MY_EPILOG
+
+end
diff --git a/other-licenses/7zstub/src/Asm/x86/LzmaDecOpt.asm b/other-licenses/7zstub/src/Asm/x86/LzmaDecOpt.asm new file mode 100644 index 000000000..0a89eb735 --- /dev/null +++ b/other-licenses/7zstub/src/Asm/x86/LzmaDecOpt.asm @@ -0,0 +1,1258 @@ +; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function
+; 2018-02-06: Igor Pavlov : Public domain
+;
+; 3 - is the code compatibility version of LzmaDec_DecodeReal_*()
+; function for check at link time.
+; That code is tightly coupled with LzmaDec_TryDummy()
+; and with another functions in LzmaDec.c file.
+; CLzmaDec structure, (probs) array layout, input and output of
+; LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM).
+
+ifndef x64
+; x64=1
+; .err <x64_IS_REQUIRED>
+endif
+
+include 7zAsm.asm
+
+MY_ASM_START
+
+_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'
+
+MY_ALIGN macro num:req
+ align num
+endm
+
+MY_ALIGN_16 macro
+ MY_ALIGN 16
+endm
+
+MY_ALIGN_32 macro
+ MY_ALIGN 32
+endm
+
+MY_ALIGN_64 macro
+ MY_ALIGN 64
+endm
+
+
+; _LZMA_SIZE_OPT equ 1
+
+; _LZMA_PROB32 equ 1
+
+ifdef _LZMA_PROB32
+ PSHIFT equ 2
+ PLOAD macro dest, mem
+ mov dest, dword ptr [mem]
+ endm
+ PSTORE macro src, mem
+ mov dword ptr [mem], src
+ endm
+else
+ PSHIFT equ 1
+ PLOAD macro dest, mem
+ movzx dest, word ptr [mem]
+ endm
+ PSTORE macro src, mem
+ mov word ptr [mem], @CatStr(src, _W)
+ endm
+endif
+
+PMULT equ (1 SHL PSHIFT)
+PMULT_HALF equ (1 SHL (PSHIFT - 1))
+PMULT_2 equ (1 SHL (PSHIFT + 1))
+
+
+; x0 range
+; x1 pbPos / (prob) TREE
+; x2 probBranch / prm (MATCHED) / pbPos / cnt
+; x3 sym
+;====== r4 === RSP
+; x5 cod
+; x6 t1 NORM_CALC / probs_state / dist
+; x7 t0 NORM_CALC / prob2 IF_BIT_1
+; x8 state
+; x9 match (MATCHED) / sym2 / dist2 / lpMask_reg
+; x10 kBitModelTotal_reg
+; r11 probs
+; x12 offs (MATCHED) / dic / len_temp
+; x13 processedPos
+; x14 bit (MATCHED) / dicPos
+; r15 buf
+
+
+cod equ x5
+cod_L equ x5_L
+range equ x0
+state equ x8
+state_R equ r8
+buf equ r15
+processedPos equ x13
+kBitModelTotal_reg equ x10
+
+probBranch equ x2
+probBranch_R equ r2
+probBranch_W equ x2_W
+
+pbPos equ x1
+pbPos_R equ r1
+
+cnt equ x2
+cnt_R equ r2
+
+lpMask_reg equ x9
+dicPos equ r14
+
+sym equ x3
+sym_R equ r3
+sym_L equ x3_L
+
+probs equ r11
+dic equ r12
+
+t0 equ x7
+t0_W equ x7_W
+t0_R equ r7
+
+prob2 equ t0
+prob2_W equ t0_W
+
+t1 equ x6
+t1_R equ r6
+
+probs_state equ t1
+probs_state_R equ t1_R
+
+prm equ r2
+match equ x9
+match_R equ r9
+offs equ x12
+offs_R equ r12
+bit equ x14
+bit_R equ r14
+
+sym2 equ x9
+sym2_R equ r9
+
+len_temp equ x12
+
+dist equ sym
+dist2 equ x9
+
+
+
+kNumBitModelTotalBits equ 11
+kBitModelTotal equ (1 SHL kNumBitModelTotalBits)
+kNumMoveBits equ 5
+kBitModelOffset equ ((1 SHL kNumMoveBits) - 1)
+kTopValue equ (1 SHL 24)
+
+NORM_2 macro
+ ; movzx t0, BYTE PTR [buf]
+ shl cod, 8
+ mov cod_L, BYTE PTR [buf]
+ shl range, 8
+ ; or cod, t0
+ inc buf
+endm
+
+
+NORM macro
+ cmp range, kTopValue
+ jae SHORT @F
+ NORM_2
+@@:
+endm
+
+
+; ---------- Branch MACROS ----------
+
+UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req
+ mov prob2, kBitModelTotal_reg
+ sub prob2, probBranch
+ shr prob2, kNumMoveBits
+ add probBranch, prob2
+ PSTORE probBranch, probOffset * 1 + probsArray + probDisp * PMULT
+endm
+
+
+UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req
+ sub prob2, range
+ sub cod, range
+ mov range, prob2
+ mov prob2, probBranch
+ shr probBranch, kNumMoveBits
+ sub prob2, probBranch
+ PSTORE prob2, probOffset * 1 + probsArray + probDisp * PMULT
+endm
+
+
+CMP_COD macro probsArray:req, probOffset:req, probDisp:req
+ PLOAD probBranch, probOffset * 1 + probsArray + probDisp * PMULT
+ NORM
+ mov prob2, range
+ shr range, kNumBitModelTotalBits
+ imul range, probBranch
+ cmp cod, range
+endm
+
+
+IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
+ CMP_COD probsArray, probOffset, probDisp
+ jae toLabel
+endm
+
+
+IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
+ IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel
+ UPDATE_0 probsArray, probOffset, probDisp
+endm
+
+
+IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
+ CMP_COD probsArray, probOffset, probDisp
+ jb toLabel
+endm
+
+
+; ---------- CMOV MACROS ----------
+
+NORM_CALC macro prob:req
+ NORM
+ mov t0, range
+ shr range, kNumBitModelTotalBits
+ imul range, prob
+ sub t0, range
+ mov t1, cod
+ sub cod, range
+endm
+
+
+PUP macro prob:req, probPtr:req
+ sub t0, prob
+ ; only sar works for both 16/32 bit prob modes
+ sar t0, kNumMoveBits
+ add t0, prob
+ PSTORE t0, probPtr
+endm
+
+
+PUP_SUB macro prob:req, probPtr:req, symSub:req
+ sbb sym, symSub
+ PUP prob, probPtr
+endm
+
+
+PUP_COD macro prob:req, probPtr:req, symSub:req
+ mov t0, kBitModelOffset
+ cmovb cod, t1
+ mov t1, sym
+ cmovb t0, kBitModelTotal_reg
+ PUP_SUB prob, probPtr, symSub
+endm
+
+
+BIT_0 macro prob:req, probNext:req
+ PLOAD prob, probs + 1 * PMULT
+ PLOAD probNext, probs + 1 * PMULT_2
+
+ NORM_CALC prob
+
+ cmovae range, t0
+ PLOAD t0, probs + 1 * PMULT_2 + PMULT
+ cmovae probNext, t0
+ mov t0, kBitModelOffset
+ cmovb cod, t1
+ cmovb t0, kBitModelTotal_reg
+ mov sym, 2
+ PUP_SUB prob, probs + 1 * PMULT, 0 - 1
+endm
+
+
+BIT_1 macro prob:req, probNext:req
+ PLOAD probNext, probs + sym_R * PMULT_2
+ add sym, sym
+
+ NORM_CALC prob
+
+ cmovae range, t0
+ PLOAD t0, probs + sym_R * PMULT + PMULT
+ cmovae probNext, t0
+ PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1
+endm
+
+
+BIT_2 macro prob:req, symSub:req
+ add sym, sym
+
+ NORM_CALC prob
+
+ cmovae range, t0
+ PUP_COD prob, probs + t1_R * PMULT_HALF, symSub
+endm
+
+
+; ---------- MATCHED LITERAL ----------
+
+LITM_0 macro
+ mov offs, 256 * PMULT
+ shl match, (PSHIFT + 1)
+ mov bit, offs
+ and bit, match
+ PLOAD x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT
+ lea prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT]
+ ; lea prm, [probs + 256 * PMULT + 1 * PMULT]
+ ; add prm, bit_R
+ xor offs, bit
+ add match, match
+
+ NORM_CALC x1
+
+ cmovae offs, bit
+ mov bit, match
+ cmovae range, t0
+ mov t0, kBitModelOffset
+ cmovb cod, t1
+ cmovb t0, kBitModelTotal_reg
+ mov sym, 0
+ PUP_SUB x1, prm, -2-1
+endm
+
+
+LITM macro
+ and bit, offs
+ lea prm, [probs + offs_R * 1]
+ add prm, bit_R
+ PLOAD x1, prm + sym_R * PMULT
+ xor offs, bit
+ add sym, sym
+ add match, match
+
+ NORM_CALC x1
+
+ cmovae offs, bit
+ mov bit, match
+ cmovae range, t0
+ PUP_COD x1, prm + t1_R * PMULT_HALF, - 1
+endm
+
+
+LITM_2 macro
+ and bit, offs
+ lea prm, [probs + offs_R * 1]
+ add prm, bit_R
+ PLOAD x1, prm + sym_R * PMULT
+ add sym, sym
+
+ NORM_CALC x1
+
+ cmovae range, t0
+ PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1
+endm
+
+
+; ---------- REVERSE BITS ----------
+
+REV_0 macro prob:req, probNext:req
+ ; PLOAD prob, probs + 1 * PMULT
+ ; lea sym2_R, [probs + 2 * PMULT]
+ ; PLOAD probNext, probs + 2 * PMULT
+ PLOAD probNext, sym2_R
+
+ NORM_CALC prob
+
+ cmovae range, t0
+ PLOAD t0, probs + 3 * PMULT
+ cmovae probNext, t0
+ cmovb cod, t1
+ mov t0, kBitModelOffset
+ cmovb t0, kBitModelTotal_reg
+ lea t1_R, [probs + 3 * PMULT]
+ cmovae sym2_R, t1_R
+ PUP prob, probs + 1 * PMULT
+endm
+
+
+REV_1 macro prob:req, probNext:req, step:req
+ add sym2_R, step * PMULT
+ PLOAD probNext, sym2_R
+
+ NORM_CALC prob
+
+ cmovae range, t0
+ PLOAD t0, sym2_R + step * PMULT
+ cmovae probNext, t0
+ cmovb cod, t1
+ mov t0, kBitModelOffset
+ cmovb t0, kBitModelTotal_reg
+ lea t1_R, [sym2_R + step * PMULT]
+ cmovae sym2_R, t1_R
+ PUP prob, t1_R - step * PMULT_2
+endm
+
+
+REV_2 macro prob:req, step:req
+ sub sym2_R, probs
+ shr sym2, PSHIFT
+ or sym, sym2
+
+ NORM_CALC prob
+
+ cmovae range, t0
+ lea t0, [sym - step]
+ cmovb sym, t0
+ cmovb cod, t1
+ mov t0, kBitModelOffset
+ cmovb t0, kBitModelTotal_reg
+ PUP prob, probs + sym2_R * PMULT
+endm
+
+
+REV_1_VAR macro prob:req
+ PLOAD prob, sym_R
+ mov probs, sym_R
+ add sym_R, sym2_R
+
+ NORM_CALC prob
+
+ cmovae range, t0
+ lea t0_R, [sym_R + sym2_R]
+ cmovae sym_R, t0_R
+ mov t0, kBitModelOffset
+ cmovb cod, t1
+ ; mov t1, kBitModelTotal
+ ; cmovb t0, t1
+ cmovb t0, kBitModelTotal_reg
+ add sym2, sym2
+ PUP prob, probs
+endm
+
+
+
+
+LIT_PROBS macro lpMaskParam:req
+ ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc);
+ mov t0, processedPos
+ shl t0, 8
+ add sym, t0
+ and sym, lpMaskParam
+ add probs_state_R, pbPos_R
+ mov x1, LOC lc2
+ lea sym, dword ptr[sym_R + 2 * sym_R]
+ add probs, Literal * PMULT
+ shl sym, x1_L
+ add probs, sym_R
+ UPDATE_0 probs_state_R, 0, IsMatch
+ inc processedPos
+endm
+
+
+
+kNumPosBitsMax equ 4
+kNumPosStatesMax equ (1 SHL kNumPosBitsMax)
+
+kLenNumLowBits equ 3
+kLenNumLowSymbols equ (1 SHL kLenNumLowBits)
+kLenNumHighBits equ 8
+kLenNumHighSymbols equ (1 SHL kLenNumHighBits)
+kNumLenProbs equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols)
+
+LenLow equ 0
+LenChoice equ LenLow
+LenChoice2 equ (LenLow + kLenNumLowSymbols)
+LenHigh equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)
+
+kNumStates equ 12
+kNumStates2 equ 16
+kNumLitStates equ 7
+
+kStartPosModelIndex equ 4
+kEndPosModelIndex equ 14
+kNumFullDistances equ (1 SHL (kEndPosModelIndex SHR 1))
+
+kNumPosSlotBits equ 6
+kNumLenToPosStates equ 4
+
+kNumAlignBits equ 4
+kAlignTableSize equ (1 SHL kNumAlignBits)
+
+kMatchMinLen equ 2
+kMatchSpecLenStart equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)
+
+kStartOffset equ 1664
+SpecPos equ (-kStartOffset)
+IsRep0Long equ (SpecPos + kNumFullDistances)
+RepLenCoder equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax))
+LenCoder equ (RepLenCoder + kNumLenProbs)
+IsMatch equ (LenCoder + kNumLenProbs)
+kAlign equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax))
+IsRep equ (kAlign + kAlignTableSize)
+IsRepG0 equ (IsRep + kNumStates)
+IsRepG1 equ (IsRepG0 + kNumStates)
+IsRepG2 equ (IsRepG1 + kNumStates)
+PosSlot equ (IsRepG2 + kNumStates)
+Literal equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits))
+NUM_BASE_PROBS equ (Literal + kStartOffset)
+
+if kAlign ne 0
+ .err <Stop_Compiling_Bad_LZMA_kAlign>
+endif
+
+if NUM_BASE_PROBS ne 1984
+ .err <Stop_Compiling_Bad_LZMA_PROBS>
+endif
+
+
+PTR_FIELD equ dq ?
+
+CLzmaDec_Asm struct
+ lc db ?
+ lp db ?
+ pb db ?
+ _pad_ db ?
+ dicSize dd ?
+
+ probs_Spec PTR_FIELD
+ probs_1664 PTR_FIELD
+ dic_Spec PTR_FIELD
+ dicBufSize PTR_FIELD
+ dicPos_Spec PTR_FIELD
+ buf_Spec PTR_FIELD
+
+ range_Spec dd ?
+ code_Spec dd ?
+ processedPos_Spec dd ?
+ checkDicSize dd ?
+ rep0 dd ?
+ rep1 dd ?
+ rep2 dd ?
+ rep3 dd ?
+ state_Spec dd ?
+ remainLen dd ?
+CLzmaDec_Asm ends
+
+
+CLzmaDec_Asm_Loc struct
+ OLD_RSP PTR_FIELD
+ lzmaPtr PTR_FIELD
+ _pad0_ PTR_FIELD
+ _pad1_ PTR_FIELD
+ _pad2_ PTR_FIELD
+ dicBufSize PTR_FIELD
+ probs_Spec PTR_FIELD
+ dic_Spec PTR_FIELD
+
+ limit PTR_FIELD
+ bufLimit PTR_FIELD
+ lc2 dd ?
+ lpMask dd ?
+ pbMask dd ?
+ checkDicSize dd ?
+
+ _pad_ dd ?
+ remainLen dd ?
+ dicPos_Spec PTR_FIELD
+ rep0 dd ?
+ rep1 dd ?
+ rep2 dd ?
+ rep3 dd ?
+CLzmaDec_Asm_Loc ends
+
+
+GLOB_2 equ [sym_R].CLzmaDec_Asm.
+GLOB equ [r1].CLzmaDec_Asm.
+LOC_0 equ [r0].CLzmaDec_Asm_Loc.
+LOC equ [RSP].CLzmaDec_Asm_Loc.
+
+
+COPY_VAR macro name
+ mov t0, GLOB_2 name
+ mov LOC_0 name, t0
+endm
+
+
+RESTORE_VAR macro name
+ mov t0, LOC name
+ mov GLOB name, t0
+endm
+
+
+
+IsMatchBranch_Pre macro reg
+ ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
+ mov pbPos, LOC pbMask
+ and pbPos, processedPos
+ shl pbPos, (kLenNumLowBits + 1 + PSHIFT)
+ lea probs_state_R, [probs + state_R]
+endm
+
+
+IsMatchBranch macro reg
+ IsMatchBranch_Pre
+ IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label
+endm
+
+
+CheckLimits macro reg
+ cmp buf, LOC bufLimit
+ jae fin_OK
+ cmp dicPos, LOC limit
+ jae fin_OK
+endm
+
+
+
+; RSP is (16x + 8) bytes aligned in WIN64-x64
+; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8)
+
+PARAM_lzma equ REG_PARAM_0
+PARAM_limit equ REG_PARAM_1
+PARAM_bufLimit equ REG_PARAM_2
+
+; MY_ALIGN_64
+MY_PROC LzmaDec_DecodeReal_3, 3
+MY_PUSH_PRESERVED_REGS
+
+ lea r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)]
+ and r0, -128
+ mov r5, RSP
+ mov RSP, r0
+ mov LOC_0 Old_RSP, r5
+ mov LOC_0 lzmaPtr, PARAM_lzma
+
+ mov LOC_0 remainLen, 0 ; remainLen must be ZERO
+
+ mov LOC_0 bufLimit, PARAM_bufLimit
+ mov sym_R, PARAM_lzma ; CLzmaDec_Asm_Loc pointer for GLOB_2
+ mov dic, GLOB_2 dic_Spec
+ add PARAM_limit, dic
+ mov LOC_0 limit, PARAM_limit
+
+ COPY_VAR(rep0)
+ COPY_VAR(rep1)
+ COPY_VAR(rep2)
+ COPY_VAR(rep3)
+
+ mov dicPos, GLOB_2 dicPos_Spec
+ add dicPos, dic
+ mov LOC_0 dicPos_Spec, dicPos
+ mov LOC_0 dic_Spec, dic
+
+ mov x1_L, GLOB_2 pb
+ mov t0, 1
+ shl t0, x1_L
+ dec t0
+ mov LOC_0 pbMask, t0
+
+ ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1;
+ ; unsigned lc = p->prop.lc;
+ ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc);
+
+ mov x1_L, GLOB_2 lc
+ mov x2, 100h
+ mov t0, x2
+ shr x2, x1_L
+ ; inc x1
+ add x1_L, PSHIFT
+ mov LOC_0 lc2, x1
+ mov x1_L, GLOB_2 lp
+ shl t0, x1_L
+ sub t0, x2
+ mov LOC_0 lpMask, t0
+ mov lpMask_reg, t0
+
+ ; mov probs, GLOB_2 probs_Spec
+ ; add probs, kStartOffset SHL PSHIFT
+ mov probs, GLOB_2 probs_1664
+ mov LOC_0 probs_Spec, probs
+
+ mov t0_R, GLOB_2 dicBufSize
+ mov LOC_0 dicBufSize, t0_R
+
+ mov x1, GLOB_2 checkDicSize
+ mov LOC_0 checkDicSize, x1
+
+ mov processedPos, GLOB_2 processedPos_Spec
+
+ mov state, GLOB_2 state_Spec
+ shl state, PSHIFT
+
+ mov buf, GLOB_2 buf_Spec
+ mov range, GLOB_2 range_Spec
+ mov cod, GLOB_2 code_Spec
+ mov kBitModelTotal_reg, kBitModelTotal
+ xor sym, sym
+
+ ; if (processedPos != 0 || checkDicSize != 0)
+ or x1, processedPos
+ jz @f
+
+ add t0_R, dic
+ cmp dicPos, dic
+ cmovnz t0_R, dicPos
+ movzx sym, byte ptr[t0_R - 1]
+
+@@:
+ IsMatchBranch_Pre
+ cmp state, 4 * PMULT
+ jb lit_end
+ cmp state, kNumLitStates * PMULT
+ jb lit_matched_end
+ jmp lz_end
+
+
+
+
+; ---------- LITERAL ----------
+MY_ALIGN_64
+lit_start:
+ xor state, state
+lit_start_2:
+ LIT_PROBS lpMask_reg
+
+ ifdef _LZMA_SIZE_OPT
+
+ PLOAD x1, probs + 1 * PMULT
+ mov sym, 1
+MY_ALIGN_16
+lit_loop:
+ BIT_1 x1, x2
+ mov x1, x2
+ cmp sym, 127
+ jbe lit_loop
+
+ else
+
+ BIT_0 x1, x2
+ BIT_1 x2, x1
+ BIT_1 x1, x2
+ BIT_1 x2, x1
+ BIT_1 x1, x2
+ BIT_1 x2, x1
+ BIT_1 x1, x2
+
+ endif
+
+ BIT_2 x2, 256 - 1
+
+ ; mov dic, LOC dic_Spec
+ mov probs, LOC probs_Spec
+ IsMatchBranch_Pre
+ mov byte ptr[dicPos], sym_L
+ inc dicPos
+
+ CheckLimits
+lit_end:
+ IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start
+
+ ; jmp IsMatch_label
+
+; ---------- MATCHES ----------
+; MY_ALIGN_32
+IsMatch_label:
+ UPDATE_1 probs_state_R, pbPos_R, IsMatch
+ IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label
+
+ add probs, LenCoder * PMULT
+ add state, kNumStates * PMULT
+
+; ---------- LEN DECODE ----------
+len_decode:
+ mov len_temp, 8 - 1 - kMatchMinLen
+ IF_BIT_0_NOUP probs, 0, 0, len_mid_0
+ UPDATE_1 probs, 0, 0
+ add probs, (1 SHL (kLenNumLowBits + PSHIFT))
+ mov len_temp, -1 - kMatchMinLen
+ IF_BIT_0_NOUP probs, 0, 0, len_mid_0
+ UPDATE_1 probs, 0, 0
+ add probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT))
+ mov sym, 1
+ PLOAD x1, probs + 1 * PMULT
+
+MY_ALIGN_32
+len8_loop:
+ BIT_1 x1, x2
+ mov x1, x2
+ cmp sym, 64
+ jb len8_loop
+
+ mov len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen
+ jmp len_mid_2
+
+MY_ALIGN_32
+len_mid_0:
+ UPDATE_0 probs, 0, 0
+ add probs, pbPos_R
+ BIT_0 x2, x1
+len_mid_2:
+ BIT_1 x1, x2
+ BIT_2 x2, len_temp
+ mov probs, LOC probs_Spec
+ cmp state, kNumStates * PMULT
+ jb copy_match
+
+
+; ---------- DECODE DISTANCE ----------
+ ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
+
+ mov t0, 3 + kMatchMinLen
+ cmp sym, 3 + kMatchMinLen
+ cmovb t0, sym
+ add probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT))
+ shl t0, (kNumPosSlotBits + PSHIFT)
+ add probs, t0_R
+
+ ; sym = Len
+ ; mov LOC remainLen, sym
+ mov len_temp, sym
+
+ ifdef _LZMA_SIZE_OPT
+
+ PLOAD x1, probs + 1 * PMULT
+ mov sym, 1
+MY_ALIGN_16
+slot_loop:
+ BIT_1 x1, x2
+ mov x1, x2
+ cmp sym, 32
+ jb slot_loop
+
+ else
+
+ BIT_0 x1, x2
+ BIT_1 x2, x1
+ BIT_1 x1, x2
+ BIT_1 x2, x1
+ BIT_1 x1, x2
+
+ endif
+
+ mov x1, sym
+ BIT_2 x2, 64-1
+
+ and sym, 3
+ mov probs, LOC probs_Spec
+ cmp x1, 32 + kEndPosModelIndex / 2
+ jb short_dist
+
+ ; unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
+ sub x1, (32 + 1 + kNumAlignBits)
+ ; distance = (2 | (distance & 1));
+ or sym, 2
+ PLOAD x2, probs + 1 * PMULT
+ shl sym, kNumAlignBits + 1
+ lea sym2_R, [probs + 2 * PMULT]
+
+ jmp direct_norm
+ ; lea t1, [sym_R + (1 SHL kNumAlignBits)]
+ ; cmp range, kTopValue
+ ; jb direct_norm
+
+; ---------- DIRECT DISTANCE ----------
+MY_ALIGN_32
+direct_loop:
+ shr range, 1
+ mov t0, cod
+ sub cod, range
+ cmovs cod, t0
+ cmovns sym, t1
+
+ comment ~
+ sub cod, range
+ mov x2, cod
+ sar x2, 31
+ lea sym, dword ptr [r2 + sym_R * 2 + 1]
+ and x2, range
+ add cod, x2
+ ~
+ dec x1
+ je direct_end
+
+ add sym, sym
+direct_norm:
+ lea t1, [sym_R + (1 SHL kNumAlignBits)]
+ cmp range, kTopValue
+ jae near ptr direct_loop
+ ; we align for 32 here with "near ptr" command above
+ NORM_2
+ jmp direct_loop
+
+MY_ALIGN_32
+direct_end:
+ ; prob = + kAlign;
+ ; distance <<= kNumAlignBits;
+ REV_0 x2, x1
+ REV_1 x1, x2, 2
+ REV_1 x2, x1, 4
+ REV_2 x1, 8
+
+decode_dist_end:
+
+ ; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))
+
+ mov t0, LOC checkDicSize
+ test t0, t0
+ cmove t0, processedPos
+ cmp sym, t0
+ jae end_of_payload
+
+ ; rep3 = rep2;
+ ; rep2 = rep1;
+ ; rep1 = rep0;
+ ; rep0 = distance + 1;
+
+ inc sym
+ mov t0, LOC rep0
+ mov t1, LOC rep1
+ mov x1, LOC rep2
+ mov LOC rep0, sym
+ ; mov sym, LOC remainLen
+ mov sym, len_temp
+ mov LOC rep1, t0
+ mov LOC rep2, t1
+ mov LOC rep3, x1
+
+ ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
+ cmp state, (kNumStates + kNumLitStates) * PMULT
+ mov state, kNumLitStates * PMULT
+ mov t0, (kNumLitStates + 3) * PMULT
+ cmovae state, t0
+
+
+; ---------- COPY MATCH ----------
+copy_match:
+
+ ; len += kMatchMinLen;
+ ; add sym, kMatchMinLen
+
+ ; if ((rem = limit - dicPos) == 0)
+ ; {
+ ; p->dicPos = dicPos;
+ ; return SZ_ERROR_DATA;
+ ; }
+ mov cnt_R, LOC limit
+ sub cnt_R, dicPos
+ jz fin_ERROR
+
+ ; curLen = ((rem < len) ? (unsigned)rem : len);
+ cmp cnt_R, sym_R
+ ; cmovae cnt_R, sym_R ; 64-bit
+ cmovae cnt, sym ; 32-bit
+
+ mov dic, LOC dic_Spec
+ mov x1, LOC rep0
+
+ mov t0_R, dicPos
+ add dicPos, cnt_R
+ ; processedPos += curLen;
+ add processedPos, cnt
+ ; len -= curLen;
+ sub sym, cnt
+ mov LOC remainLen, sym
+
+ sub t0_R, dic
+
+ ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
+ sub t0_R, r1
+ jae @f
+
+ mov r1, LOC dicBufSize
+ add t0_R, r1
+ sub r1, t0_R
+ cmp cnt_R, r1
+ ja copy_match_cross
+@@:
+ ; if (curLen <= dicBufSize - pos)
+
+; ---------- COPY MATCH FAST ----------
+ ; Byte *dest = dic + dicPos;
+ ; mov r1, dic
+ ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos;
+ ; sub t0_R, dicPos
+ ; dicPos += curLen;
+
+ ; const Byte *lim = dest + curLen;
+ add t0_R, dic
+ movzx sym, byte ptr[t0_R]
+ add t0_R, cnt_R
+ neg cnt_R
+ ; lea r1, [dicPos - 1]
+copy_common:
+ dec dicPos
+ ; cmp LOC rep0, 1
+ ; je rep0Label
+
+ ; t0_R - src_lim
+ ; r1 - dest_lim - 1
+ ; cnt_R - (-cnt)
+
+ IsMatchBranch_Pre
+ inc cnt_R
+ jz copy_end
+MY_ALIGN_16
+@@:
+ mov byte ptr[cnt_R * 1 + dicPos], sym_L
+ movzx sym, byte ptr[cnt_R * 1 + t0_R]
+ inc cnt_R
+ jnz @b
+
+copy_end:
+lz_end_match:
+ mov byte ptr[dicPos], sym_L
+ inc dicPos
+
+ ; IsMatchBranch_Pre
+ CheckLimits
+lz_end:
+ IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
+
+
+
+; ---------- LITERAL MATCHED ----------
+
+ LIT_PROBS LOC lpMask
+
+ ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
+ mov x1, LOC rep0
+ ; mov dic, LOC dic_Spec
+ mov LOC dicPos_Spec, dicPos
+
+ ; state -= (state < 10) ? 3 : 6;
+ lea t0, [state_R - 6 * PMULT]
+ sub state, 3 * PMULT
+ cmp state, 7 * PMULT
+ cmovae state, t0
+
+ sub dicPos, dic
+ sub dicPos, r1
+ jae @f
+ add dicPos, LOC dicBufSize
+@@:
+ comment ~
+ xor t0, t0
+ sub dicPos, r1
+ cmovb t0_R, LOC dicBufSize
+ ~
+
+ movzx match, byte ptr[dic + dicPos * 1]
+
+ ifdef _LZMA_SIZE_OPT
+
+ mov offs, 256 * PMULT
+ shl match, (PSHIFT + 1)
+ mov bit, match
+ mov sym, 1
+MY_ALIGN_16
+litm_loop:
+ LITM
+ cmp sym, 256
+ jb litm_loop
+ sub sym, 256
+
+ else
+
+ LITM_0
+ LITM
+ LITM
+ LITM
+ LITM
+ LITM
+ LITM
+ LITM_2
+
+ endif
+
+ mov probs, LOC probs_Spec
+ IsMatchBranch_Pre
+ ; mov dic, LOC dic_Spec
+ mov dicPos, LOC dicPos_Spec
+ mov byte ptr[dicPos], sym_L
+ inc dicPos
+
+ CheckLimits
+lit_matched_end:
+ IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
+ ; IsMatchBranch
+ mov lpMask_reg, LOC lpMask
+ sub state, 3 * PMULT
+ jmp lit_start_2
+
+
+
+; ---------- REP 0 LITERAL ----------
+MY_ALIGN_32
+IsRep0Short_label:
+ UPDATE_0 probs_state_R, pbPos_R, IsRep0Long
+
+ ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
+ mov dic, LOC dic_Spec
+ mov t0_R, dicPos
+ mov probBranch, LOC rep0
+ sub t0_R, dic
+
+ sub probs, RepLenCoder * PMULT
+ inc processedPos
+ ; state = state < kNumLitStates ? 9 : 11;
+ or state, 1 * PMULT
+ IsMatchBranch_Pre
+
+ sub t0_R, probBranch_R
+ jae @f
+ add t0_R, LOC dicBufSize
+@@:
+ movzx sym, byte ptr[dic + t0_R * 1]
+ jmp lz_end_match
+
+
+MY_ALIGN_32
+IsRep_label:
+ UPDATE_1 probs_state_R, 0, IsRep
+
+ ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
+ ; So we don't check it here.
+
+ ; mov t0, processedPos
+ ; or t0, LOC checkDicSize
+ ; jz fin_ERROR_2
+
+ ; state = state < kNumLitStates ? 8 : 11;
+ cmp state, kNumLitStates * PMULT
+ mov state, 8 * PMULT
+ mov probBranch, 11 * PMULT
+ cmovae state, probBranch
+
+ ; prob = probs + RepLenCoder;
+ add probs, RepLenCoder * PMULT
+
+ IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label
+ IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label
+ UPDATE_1 probs_state_R, pbPos_R, IsRep0Long
+ jmp len_decode
+
+MY_ALIGN_32
+IsRepG0_label:
+ UPDATE_1 probs_state_R, 0, IsRepG0
+ mov dist2, LOC rep0
+ mov dist, LOC rep1
+ mov LOC rep1, dist2
+
+ IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label
+ mov LOC rep0, dist
+ jmp len_decode
+
+; MY_ALIGN_32
+IsRepG1_label:
+ UPDATE_1 probs_state_R, 0, IsRepG1
+ mov dist2, LOC rep2
+ mov LOC rep2, dist
+
+ IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label
+ mov LOC rep0, dist2
+ jmp len_decode
+
+; MY_ALIGN_32
+IsRepG2_label:
+ UPDATE_1 probs_state_R, 0, IsRepG2
+ mov dist, LOC rep3
+ mov LOC rep3, dist2
+ mov LOC rep0, dist
+ jmp len_decode
+
+
+
+; ---------- SPEC SHORT DISTANCE ----------
+
+MY_ALIGN_32
+short_dist:
+ sub x1, 32 + 1
+ jbe decode_dist_end
+ or sym, 2
+ shl sym, x1_L
+ lea sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT]
+ mov sym2, PMULT ; step
+MY_ALIGN_32
+spec_loop:
+ REV_1_VAR x2
+ dec x1
+ jnz spec_loop
+
+ mov probs, LOC probs_Spec
+ sub sym, sym2
+ sub sym, SpecPos * PMULT
+ sub sym_R, probs
+ shr sym, PSHIFT
+
+ jmp decode_dist_end
+
+
+; ---------- COPY MATCH CROSS ----------
+copy_match_cross:
+ ; t0_R - src pos
+ ; r1 - len to dicBufSize
+ ; cnt_R - total copy len
+
+ mov t1_R, t0_R ; srcPos
+ mov t0_R, dic
+ mov r1, LOC dicBufSize ;
+ neg cnt_R
+@@:
+ movzx sym, byte ptr[t1_R * 1 + t0_R]
+ inc t1_R
+ mov byte ptr[cnt_R * 1 + dicPos], sym_L
+ inc cnt_R
+ cmp t1_R, r1
+ jne @b
+
+ movzx sym, byte ptr[t0_R]
+ sub t0_R, cnt_R
+ jmp copy_common
+
+
+
+
+fin_ERROR:
+ mov LOC remainLen, len_temp
+; fin_ERROR_2:
+ mov sym, 1
+ jmp fin
+
+end_of_payload:
+ cmp sym, 0FFFFFFFFh ; -1
+ jne fin_ERROR
+
+ mov LOC remainLen, kMatchSpecLenStart
+ sub state, kNumStates * PMULT
+
+fin_OK:
+ xor sym, sym
+
+fin:
+ NORM
+
+ mov r1, LOC lzmaPtr
+
+ sub dicPos, LOC dic_Spec
+ mov GLOB dicPos_Spec, dicPos
+ mov GLOB buf_Spec, buf
+ mov GLOB range_Spec, range
+ mov GLOB code_Spec, cod
+ shr state, PSHIFT
+ mov GLOB state_Spec, state
+ mov GLOB processedPos_Spec, processedPos
+
+ RESTORE_VAR(remainLen)
+ RESTORE_VAR(rep0)
+ RESTORE_VAR(rep1)
+ RESTORE_VAR(rep2)
+ RESTORE_VAR(rep3)
+
+ mov x0, sym
+
+ mov RSP, LOC Old_RSP
+
+MY_POP_PRESERVED_REGS
+MY_ENDP
+
+_TEXT$LZMADECOPT ENDS
+
+end
diff --git a/other-licenses/7zstub/src/Asm/x86/XzCrc64Opt.asm b/other-licenses/7zstub/src/Asm/x86/XzCrc64Opt.asm new file mode 100644 index 000000000..3e6d49026 --- /dev/null +++ b/other-licenses/7zstub/src/Asm/x86/XzCrc64Opt.asm @@ -0,0 +1,205 @@ +; XzCrc64Opt.asm -- CRC64 calculation : optimized version
+; 2011-06-28 : Igor Pavlov : Public domain
+
+include 7zAsm.asm
+
+MY_ASM_START
+
+ifdef x64
+
+ rD equ r9
+ rN equ r10
+
+ num_VAR equ r8
+ table_VAR equ r9
+
+ SRCDAT equ rN + rD
+
+CRC_XOR macro dest:req, src:req, t:req
+ xor dest, QWORD PTR [r5 + src * 8 + 0800h * t]
+endm
+
+CRC1b macro
+ movzx x6, BYTE PTR [rD]
+ inc rD
+ movzx x3, x0_L
+ xor x6, x3
+ shr r0, 8
+ CRC_XOR r0, r6, 0
+ dec rN
+endm
+
+MY_PROLOG macro crc_end:req
+ MY_PUSH_4_REGS
+
+ mov r0, r1
+ mov rN, num_VAR
+ mov r5, table_VAR
+ mov rD, r2
+ test rN, rN
+ jz crc_end
+ @@:
+ test rD, 3
+ jz @F
+ CRC1b
+ jnz @B
+ @@:
+ cmp rN, 8
+ jb crc_end
+ add rN, rD
+ mov num_VAR, rN
+ sub rN, 4
+ and rN, NOT 3
+ sub rD, rN
+ mov x1, [SRCDAT]
+ xor r0, r1
+ add rN, 4
+endm
+
+MY_EPILOG macro crc_end:req
+ sub rN, 4
+ mov x1, [SRCDAT]
+ xor r0, r1
+ mov rD, rN
+ mov rN, num_VAR
+ sub rN, rD
+ crc_end:
+ test rN, rN
+ jz @F
+ CRC1b
+ jmp crc_end
+ @@:
+ MY_POP_4_REGS
+endm
+
+MY_PROC XzCrc64UpdateT4, 4
+ MY_PROLOG crc_end_4
+ align 16
+ main_loop_4:
+ mov x1, [SRCDAT]
+ movzx x2, x0_L
+ movzx x3, x0_H
+ shr r0, 16
+ movzx x6, x0_L
+ movzx x7, x0_H
+ shr r0, 16
+ CRC_XOR r1, r2, 3
+ CRC_XOR r0, r3, 2
+ CRC_XOR r1, r6, 1
+ CRC_XOR r0, r7, 0
+ xor r0, r1
+
+ add rD, 4
+ jnz main_loop_4
+
+ MY_EPILOG crc_end_4
+MY_ENDP
+
+else
+
+ rD equ r1
+ rN equ r7
+
+ crc_val equ (REG_SIZE * 5)
+ crc_table equ (8 + crc_val)
+ table_VAR equ [r4 + crc_table]
+ num_VAR equ table_VAR
+
+
+ SRCDAT equ rN + rD
+
+CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req
+ op0 dest0, DWORD PTR [r5 + src * 8 + 0800h * t]
+ op1 dest1, DWORD PTR [r5 + src * 8 + 0800h * t + 4]
+endm
+
+CRC_XOR macro dest0:req, dest1:req, src:req, t:req
+ CRC xor, xor, dest0, dest1, src, t
+endm
+
+
+CRC1b macro
+ movzx x6, BYTE PTR [rD]
+ inc rD
+ movzx x3, x0_L
+ xor x6, x3
+ shrd r0, r2, 8
+ shr r2, 8
+ CRC_XOR r0, r2, r6, 0
+ dec rN
+endm
+
+MY_PROLOG macro crc_end:req
+ MY_PUSH_4_REGS
+
+ mov rN, r2
+
+ mov x0, [r4 + crc_val]
+ mov x2, [r4 + crc_val + 4]
+ mov r5, table_VAR
+ test rN, rN
+ jz crc_end
+ @@:
+ test rD, 3
+ jz @F
+ CRC1b
+ jnz @B
+ @@:
+ cmp rN, 8
+ jb crc_end
+ add rN, rD
+
+ mov num_VAR, rN
+
+ sub rN, 4
+ and rN, NOT 3
+ sub rD, rN
+ xor r0, [SRCDAT]
+ add rN, 4
+endm
+
+MY_EPILOG macro crc_end:req
+ sub rN, 4
+ xor r0, [SRCDAT]
+
+ mov rD, rN
+ mov rN, num_VAR
+ sub rN, rD
+ crc_end:
+ test rN, rN
+ jz @F
+ CRC1b
+ jmp crc_end
+ @@:
+ MY_POP_4_REGS
+endm
+
+MY_PROC XzCrc64UpdateT4, 5
+ MY_PROLOG crc_end_4
+ movzx x6, x0_L
+ align 16
+ main_loop_4:
+ mov r3, [SRCDAT]
+ xor r3, r2
+
+ CRC xor, mov, r3, r2, r6, 3
+ movzx x6, x0_H
+ shr r0, 16
+ CRC_XOR r3, r2, r6, 2
+
+ movzx x6, x0_L
+ movzx x0, x0_H
+ CRC_XOR r3, r2, r6, 1
+ CRC_XOR r3, r2, r0, 0
+ movzx x6, x3_L
+ mov r0, r3
+
+ add rD, 4
+ jnz main_loop_4
+
+ MY_EPILOG crc_end_4
+MY_ENDP
+
+endif
+
+end
|