From f2902217b38cf2e16e851ae84d61247f8e828180 Mon Sep 17 00:00:00 2001 From: wolfbeast Date: Mon, 25 Mar 2019 17:53:14 +0100 Subject: Update the 7z installer stub source to 18.05. Tag #1022 --- other-licenses/7zstub/src/Asm/x86/7zAsm.asm | 147 +++ other-licenses/7zstub/src/Asm/x86/7zCrcOpt.asm | 147 +++ other-licenses/7zstub/src/Asm/x86/AesOpt.asm | 237 ++++ other-licenses/7zstub/src/Asm/x86/LzmaDecOpt.asm | 1258 ++++++++++++++++++++++ other-licenses/7zstub/src/Asm/x86/XzCrc64Opt.asm | 205 ++++ 5 files changed, 1994 insertions(+) create mode 100644 other-licenses/7zstub/src/Asm/x86/7zAsm.asm create mode 100644 other-licenses/7zstub/src/Asm/x86/7zCrcOpt.asm create mode 100644 other-licenses/7zstub/src/Asm/x86/AesOpt.asm create mode 100644 other-licenses/7zstub/src/Asm/x86/LzmaDecOpt.asm create mode 100644 other-licenses/7zstub/src/Asm/x86/XzCrc64Opt.asm (limited to 'other-licenses/7zstub/src/Asm/x86') diff --git a/other-licenses/7zstub/src/Asm/x86/7zAsm.asm b/other-licenses/7zstub/src/Asm/x86/7zAsm.asm new file mode 100644 index 000000000..8c30d7b7e --- /dev/null +++ b/other-licenses/7zstub/src/Asm/x86/7zAsm.asm @@ -0,0 +1,147 @@ +; 7zAsm.asm -- ASM macros +; 2018-02-03 : Igor Pavlov : Public domain + +MY_ASM_START macro + ifdef x64 + .code + else + .386 + .model flat + _TEXT$00 SEGMENT PARA PUBLIC 'CODE' + endif +endm + +MY_PROC macro name:req, numParams:req + align 16 + proc_numParams = numParams + ifdef x64 + proc_name equ name + else + proc_name equ @CatStr(@,name,@, %numParams * 4) + endif + proc_name PROC +endm + +MY_ENDP macro + ifdef x64 + ret + else + if proc_numParams LT 3 + ret + else + ret (proc_numParams - 2) * 4 + endif + endif + proc_name ENDP +endm + +ifdef x64 + REG_SIZE equ 8 + REG_LOGAR_SIZE equ 3 +else + REG_SIZE equ 4 + REG_LOGAR_SIZE equ 2 +endif + + x0 equ EAX + x1 equ ECX + x2 equ EDX + x3 equ EBX + x4 equ ESP + x5 equ EBP + x6 equ ESI + x7 equ EDI + + x0_W equ AX + x1_W equ CX + x2_W equ DX + x3_W equ BX + + x5_W equ BP + x6_W equ SI + x7_W equ DI + + x0_L equ AL + x1_L equ CL + x2_L equ DL + x3_L equ BL + + x0_H equ AH + x1_H equ CH + x2_H equ DH + x3_H equ BH + +ifdef x64 + x5_L equ BPL + x6_L equ SIL + x7_L equ DIL + + r0 equ RAX + r1 equ RCX + r2 equ RDX + r3 equ RBX + r4 equ RSP + r5 equ RBP + r6 equ RSI + r7 equ RDI + x8 equ r8d + x9 equ r9d + x10 equ r10d + x11 equ r11d + x12 equ r12d + x13 equ r13d + x14 equ r14d + x15 equ r15d +else + r0 equ x0 + r1 equ x1 + r2 equ x2 + r3 equ x3 + r4 equ x4 + r5 equ x5 + r6 equ x6 + r7 equ x7 +endif + +MY_PUSH_4_REGS macro + push r3 + push r5 + push r6 + push r7 +endm + +MY_POP_4_REGS macro + pop r7 + pop r6 + pop r5 + pop r3 +endm + + +ifdef x64 + +; for WIN64-x64 ABI: + +REG_PARAM_0 equ r1 +REG_PARAM_1 equ r2 +REG_PARAM_2 equ r8 +REG_PARAM_3 equ r9 + +MY_PUSH_PRESERVED_REGS macro + MY_PUSH_4_REGS + push r12 + push r13 + push r14 + push r15 +endm + + +MY_POP_PRESERVED_REGS macro + pop r15 + pop r14 + pop r13 + pop r12 + MY_POP_4_REGS +endm + +endif diff --git a/other-licenses/7zstub/src/Asm/x86/7zCrcOpt.asm b/other-licenses/7zstub/src/Asm/x86/7zCrcOpt.asm new file mode 100644 index 000000000..2de51719a --- /dev/null +++ b/other-licenses/7zstub/src/Asm/x86/7zCrcOpt.asm @@ -0,0 +1,147 @@ +; 7zCrcOpt.asm -- CRC32 calculation : optimized version +; 2009-12-12 : Igor Pavlov : Public domain + +include 7zAsm.asm + +MY_ASM_START + +rD equ r2 +rN equ r7 + +ifdef x64 + num_VAR equ r8 + table_VAR equ r9 +else + data_size equ (REG_SIZE * 5) + crc_table equ (REG_SIZE + data_size) + num_VAR equ [r4 + data_size] + table_VAR equ [r4 + crc_table] +endif + +SRCDAT equ rN + rD + 4 * + +CRC macro op:req, dest:req, src:req, t:req + op dest, DWORD PTR [r5 + src * 4 + 0400h * t] +endm + +CRC_XOR macro dest:req, src:req, t:req + CRC xor, dest, src, t +endm + +CRC_MOV macro dest:req, src:req, t:req + CRC mov, dest, src, t +endm + +CRC1b macro + movzx x6, BYTE PTR [rD] + inc rD + movzx x3, x0_L + xor x6, x3 + shr x0, 8 + CRC xor, x0, r6, 0 + dec rN +endm + +MY_PROLOG macro crc_end:req + MY_PUSH_4_REGS + + mov x0, x1 + mov rN, num_VAR + mov r5, table_VAR + test rN, rN + jz crc_end + @@: + test rD, 7 + jz @F + CRC1b + jnz @B + @@: + cmp rN, 16 + jb crc_end + add rN, rD + mov num_VAR, rN + sub rN, 8 + and rN, NOT 7 + sub rD, rN + xor x0, [SRCDAT 0] +endm + +MY_EPILOG macro crc_end:req + xor x0, [SRCDAT 0] + mov rD, rN + mov rN, num_VAR + sub rN, rD + crc_end: + test rN, rN + jz @F + CRC1b + jmp crc_end + @@: + MY_POP_4_REGS +endm + +MY_PROC CrcUpdateT8, 4 + MY_PROLOG crc_end_8 + mov x1, [SRCDAT 1] + align 16 + main_loop_8: + mov x6, [SRCDAT 2] + movzx x3, x1_L + CRC_XOR x6, r3, 3 + movzx x3, x1_H + CRC_XOR x6, r3, 2 + shr x1, 16 + movzx x3, x1_L + movzx x1, x1_H + CRC_XOR x6, r3, 1 + movzx x3, x0_L + CRC_XOR x6, r1, 0 + + mov x1, [SRCDAT 3] + CRC_XOR x6, r3, 7 + movzx x3, x0_H + shr x0, 16 + CRC_XOR x6, r3, 6 + movzx x3, x0_L + CRC_XOR x6, r3, 5 + movzx x3, x0_H + CRC_MOV x0, r3, 4 + xor x0, x6 + add rD, 8 + jnz main_loop_8 + + MY_EPILOG crc_end_8 +MY_ENDP + +MY_PROC CrcUpdateT4, 4 + MY_PROLOG crc_end_4 + align 16 + main_loop_4: + movzx x1, x0_L + movzx x3, x0_H + shr x0, 16 + movzx x6, x0_H + and x0, 0FFh + CRC_MOV x1, r1, 3 + xor x1, [SRCDAT 1] + CRC_XOR x1, r3, 2 + CRC_XOR x1, r6, 0 + CRC_XOR x1, r0, 1 + + movzx x0, x1_L + movzx x3, x1_H + shr x1, 16 + movzx x6, x1_H + and x1, 0FFh + CRC_MOV x0, r0, 3 + xor x0, [SRCDAT 2] + CRC_XOR x0, r3, 2 + CRC_XOR x0, r6, 0 + CRC_XOR x0, r1, 1 + add rD, 8 + jnz main_loop_4 + + MY_EPILOG crc_end_4 +MY_ENDP + +end diff --git a/other-licenses/7zstub/src/Asm/x86/AesOpt.asm b/other-licenses/7zstub/src/Asm/x86/AesOpt.asm new file mode 100644 index 000000000..c32e48f88 --- /dev/null +++ b/other-licenses/7zstub/src/Asm/x86/AesOpt.asm @@ -0,0 +1,237 @@ +; AesOpt.asm -- Intel's AES. +; 2009-12-12 : Igor Pavlov : Public domain + +include 7zAsm.asm + +MY_ASM_START + +ifndef x64 + .xmm +endif + +ifdef x64 + num equ r8 +else + num equ [r4 + REG_SIZE * 4] +endif + +rD equ r2 +rN equ r0 + +MY_PROLOG macro reg:req + ifdef x64 + movdqa [r4 + 8], xmm6 + movdqa [r4 + 8 + 16], xmm7 + endif + + push r3 + push r5 + push r6 + + mov rN, num + mov x6, [r1 + 16] + shl x6, 5 + + movdqa reg, [r1] + add r1, 32 +endm + +MY_EPILOG macro + pop r6 + pop r5 + pop r3 + + ifdef x64 + movdqa xmm6, [r4 + 8] + movdqa xmm7, [r4 + 8 + 16] + endif + + MY_ENDP +endm + +ways equ 4 +ways16 equ (ways * 16) + +OP_W macro op, op2 + i = 0 + rept ways + op @CatStr(xmm,%i), op2 + i = i + 1 + endm +endm + +LOAD_OP macro op:req, offs:req + op xmm0, [r1 + r3 offs] +endm + +LOAD_OP_W macro op:req, offs:req + movdqa xmm7, [r1 + r3 offs] + OP_W op, xmm7 +endm + + +; ---------- AES-CBC Decode ---------- + +CBC_DEC_UPDATE macro reg, offs + pxor reg, xmm6 + movdqa xmm6, [rD + offs] + movdqa [rD + offs], reg +endm + +DECODE macro op:req + op aesdec, +16 + @@: + op aesdec, +0 + op aesdec, -16 + sub x3, 32 + jnz @B + op aesdeclast, +0 +endm + +MY_PROC AesCbc_Decode_Intel, 3 + MY_PROLOG xmm6 + + sub x6, 32 + + jmp check2 + + align 16 + nextBlocks2: + mov x3, x6 + OP_W movdqa, [rD + i * 16] + LOAD_OP_W pxor, +32 + DECODE LOAD_OP_W + OP_W CBC_DEC_UPDATE, i * 16 + add rD, ways16 + check2: + sub rN, ways + jnc nextBlocks2 + + add rN, ways + jmp check + + nextBlock: + mov x3, x6 + movdqa xmm1, [rD] + LOAD_OP movdqa, +32 + pxor xmm0, xmm1 + DECODE LOAD_OP + pxor xmm0, xmm6 + movdqa [rD], xmm0 + movdqa xmm6, xmm1 + add rD, 16 + check: + sub rN, 1 + jnc nextBlock + + movdqa [r1 - 32], xmm6 + MY_EPILOG + + +; ---------- AES-CBC Encode ---------- + +ENCODE macro op:req + op aesenc, -16 + @@: + op aesenc, +0 + op aesenc, +16 + add r3, 32 + jnz @B + op aesenclast, +0 +endm + +MY_PROC AesCbc_Encode_Intel, 3 + MY_PROLOG xmm0 + + add r1, r6 + neg r6 + add r6, 32 + + jmp check_e + + align 16 + nextBlock_e: + mov r3, r6 + pxor xmm0, [rD] + pxor xmm0, [r1 + r3 - 32] + ENCODE LOAD_OP + movdqa [rD], xmm0 + add rD, 16 + check_e: + sub rN, 1 + jnc nextBlock_e + + movdqa [r1 + r6 - 64], xmm0 + MY_EPILOG + + +; ---------- AES-CTR ---------- + +XOR_UPD_1 macro reg, offs + pxor reg, [rD + offs] +endm + +XOR_UPD_2 macro reg, offs + movdqa [rD + offs], reg +endm + +MY_PROC AesCtr_Code_Intel, 3 + MY_PROLOG xmm6 + + mov r5, r4 + shr r5, 4 + dec r5 + shl r5, 4 + + mov DWORD PTR [r5], 1 + mov DWORD PTR [r5 + 4], 0 + mov DWORD PTR [r5 + 8], 0 + mov DWORD PTR [r5 + 12], 0 + + add r1, r6 + neg r6 + add r6, 32 + + jmp check2_c + + align 16 + nextBlocks2_c: + movdqa xmm7, [r5] + + i = 0 + rept ways + paddq xmm6, xmm7 + movdqa @CatStr(xmm,%i), xmm6 + i = i + 1 + endm + + mov r3, r6 + LOAD_OP_W pxor, -32 + ENCODE LOAD_OP_W + OP_W XOR_UPD_1, i * 16 + OP_W XOR_UPD_2, i * 16 + add rD, ways16 + check2_c: + sub rN, ways + jnc nextBlocks2_c + + add rN, ways + jmp check_c + + nextBlock_c: + paddq xmm6, [r5] + mov r3, r6 + movdqa xmm0, [r1 + r3 - 32] + pxor xmm0, xmm6 + ENCODE LOAD_OP + XOR_UPD_1 xmm0, 0 + XOR_UPD_2 xmm0, 0 + add rD, 16 + check_c: + sub rN, 1 + jnc nextBlock_c + + movdqa [r1 + r6 - 64], xmm6 + MY_EPILOG + +end diff --git a/other-licenses/7zstub/src/Asm/x86/LzmaDecOpt.asm b/other-licenses/7zstub/src/Asm/x86/LzmaDecOpt.asm new file mode 100644 index 000000000..0a89eb735 --- /dev/null +++ b/other-licenses/7zstub/src/Asm/x86/LzmaDecOpt.asm @@ -0,0 +1,1258 @@ +; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function +; 2018-02-06: Igor Pavlov : Public domain +; +; 3 - is the code compatibility version of LzmaDec_DecodeReal_*() +; function for check at link time. +; That code is tightly coupled with LzmaDec_TryDummy() +; and with another functions in LzmaDec.c file. +; CLzmaDec structure, (probs) array layout, input and output of +; LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM). + +ifndef x64 +; x64=1 +; .err +endif + +include 7zAsm.asm + +MY_ASM_START + +_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE' + +MY_ALIGN macro num:req + align num +endm + +MY_ALIGN_16 macro + MY_ALIGN 16 +endm + +MY_ALIGN_32 macro + MY_ALIGN 32 +endm + +MY_ALIGN_64 macro + MY_ALIGN 64 +endm + + +; _LZMA_SIZE_OPT equ 1 + +; _LZMA_PROB32 equ 1 + +ifdef _LZMA_PROB32 + PSHIFT equ 2 + PLOAD macro dest, mem + mov dest, dword ptr [mem] + endm + PSTORE macro src, mem + mov dword ptr [mem], src + endm +else + PSHIFT equ 1 + PLOAD macro dest, mem + movzx dest, word ptr [mem] + endm + PSTORE macro src, mem + mov word ptr [mem], @CatStr(src, _W) + endm +endif + +PMULT equ (1 SHL PSHIFT) +PMULT_HALF equ (1 SHL (PSHIFT - 1)) +PMULT_2 equ (1 SHL (PSHIFT + 1)) + + +; x0 range +; x1 pbPos / (prob) TREE +; x2 probBranch / prm (MATCHED) / pbPos / cnt +; x3 sym +;====== r4 === RSP +; x5 cod +; x6 t1 NORM_CALC / probs_state / dist +; x7 t0 NORM_CALC / prob2 IF_BIT_1 +; x8 state +; x9 match (MATCHED) / sym2 / dist2 / lpMask_reg +; x10 kBitModelTotal_reg +; r11 probs +; x12 offs (MATCHED) / dic / len_temp +; x13 processedPos +; x14 bit (MATCHED) / dicPos +; r15 buf + + +cod equ x5 +cod_L equ x5_L +range equ x0 +state equ x8 +state_R equ r8 +buf equ r15 +processedPos equ x13 +kBitModelTotal_reg equ x10 + +probBranch equ x2 +probBranch_R equ r2 +probBranch_W equ x2_W + +pbPos equ x1 +pbPos_R equ r1 + +cnt equ x2 +cnt_R equ r2 + +lpMask_reg equ x9 +dicPos equ r14 + +sym equ x3 +sym_R equ r3 +sym_L equ x3_L + +probs equ r11 +dic equ r12 + +t0 equ x7 +t0_W equ x7_W +t0_R equ r7 + +prob2 equ t0 +prob2_W equ t0_W + +t1 equ x6 +t1_R equ r6 + +probs_state equ t1 +probs_state_R equ t1_R + +prm equ r2 +match equ x9 +match_R equ r9 +offs equ x12 +offs_R equ r12 +bit equ x14 +bit_R equ r14 + +sym2 equ x9 +sym2_R equ r9 + +len_temp equ x12 + +dist equ sym +dist2 equ x9 + + + +kNumBitModelTotalBits equ 11 +kBitModelTotal equ (1 SHL kNumBitModelTotalBits) +kNumMoveBits equ 5 +kBitModelOffset equ ((1 SHL kNumMoveBits) - 1) +kTopValue equ (1 SHL 24) + +NORM_2 macro + ; movzx t0, BYTE PTR [buf] + shl cod, 8 + mov cod_L, BYTE PTR [buf] + shl range, 8 + ; or cod, t0 + inc buf +endm + + +NORM macro + cmp range, kTopValue + jae SHORT @F + NORM_2 +@@: +endm + + +; ---------- Branch MACROS ---------- + +UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req + mov prob2, kBitModelTotal_reg + sub prob2, probBranch + shr prob2, kNumMoveBits + add probBranch, prob2 + PSTORE probBranch, probOffset * 1 + probsArray + probDisp * PMULT +endm + + +UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req + sub prob2, range + sub cod, range + mov range, prob2 + mov prob2, probBranch + shr probBranch, kNumMoveBits + sub prob2, probBranch + PSTORE prob2, probOffset * 1 + probsArray + probDisp * PMULT +endm + + +CMP_COD macro probsArray:req, probOffset:req, probDisp:req + PLOAD probBranch, probOffset * 1 + probsArray + probDisp * PMULT + NORM + mov prob2, range + shr range, kNumBitModelTotalBits + imul range, probBranch + cmp cod, range +endm + + +IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req + CMP_COD probsArray, probOffset, probDisp + jae toLabel +endm + + +IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req + IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel + UPDATE_0 probsArray, probOffset, probDisp +endm + + +IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req + CMP_COD probsArray, probOffset, probDisp + jb toLabel +endm + + +; ---------- CMOV MACROS ---------- + +NORM_CALC macro prob:req + NORM + mov t0, range + shr range, kNumBitModelTotalBits + imul range, prob + sub t0, range + mov t1, cod + sub cod, range +endm + + +PUP macro prob:req, probPtr:req + sub t0, prob + ; only sar works for both 16/32 bit prob modes + sar t0, kNumMoveBits + add t0, prob + PSTORE t0, probPtr +endm + + +PUP_SUB macro prob:req, probPtr:req, symSub:req + sbb sym, symSub + PUP prob, probPtr +endm + + +PUP_COD macro prob:req, probPtr:req, symSub:req + mov t0, kBitModelOffset + cmovb cod, t1 + mov t1, sym + cmovb t0, kBitModelTotal_reg + PUP_SUB prob, probPtr, symSub +endm + + +BIT_0 macro prob:req, probNext:req + PLOAD prob, probs + 1 * PMULT + PLOAD probNext, probs + 1 * PMULT_2 + + NORM_CALC prob + + cmovae range, t0 + PLOAD t0, probs + 1 * PMULT_2 + PMULT + cmovae probNext, t0 + mov t0, kBitModelOffset + cmovb cod, t1 + cmovb t0, kBitModelTotal_reg + mov sym, 2 + PUP_SUB prob, probs + 1 * PMULT, 0 - 1 +endm + + +BIT_1 macro prob:req, probNext:req + PLOAD probNext, probs + sym_R * PMULT_2 + add sym, sym + + NORM_CALC prob + + cmovae range, t0 + PLOAD t0, probs + sym_R * PMULT + PMULT + cmovae probNext, t0 + PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1 +endm + + +BIT_2 macro prob:req, symSub:req + add sym, sym + + NORM_CALC prob + + cmovae range, t0 + PUP_COD prob, probs + t1_R * PMULT_HALF, symSub +endm + + +; ---------- MATCHED LITERAL ---------- + +LITM_0 macro + mov offs, 256 * PMULT + shl match, (PSHIFT + 1) + mov bit, offs + and bit, match + PLOAD x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT + lea prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT] + ; lea prm, [probs + 256 * PMULT + 1 * PMULT] + ; add prm, bit_R + xor offs, bit + add match, match + + NORM_CALC x1 + + cmovae offs, bit + mov bit, match + cmovae range, t0 + mov t0, kBitModelOffset + cmovb cod, t1 + cmovb t0, kBitModelTotal_reg + mov sym, 0 + PUP_SUB x1, prm, -2-1 +endm + + +LITM macro + and bit, offs + lea prm, [probs + offs_R * 1] + add prm, bit_R + PLOAD x1, prm + sym_R * PMULT + xor offs, bit + add sym, sym + add match, match + + NORM_CALC x1 + + cmovae offs, bit + mov bit, match + cmovae range, t0 + PUP_COD x1, prm + t1_R * PMULT_HALF, - 1 +endm + + +LITM_2 macro + and bit, offs + lea prm, [probs + offs_R * 1] + add prm, bit_R + PLOAD x1, prm + sym_R * PMULT + add sym, sym + + NORM_CALC x1 + + cmovae range, t0 + PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1 +endm + + +; ---------- REVERSE BITS ---------- + +REV_0 macro prob:req, probNext:req + ; PLOAD prob, probs + 1 * PMULT + ; lea sym2_R, [probs + 2 * PMULT] + ; PLOAD probNext, probs + 2 * PMULT + PLOAD probNext, sym2_R + + NORM_CALC prob + + cmovae range, t0 + PLOAD t0, probs + 3 * PMULT + cmovae probNext, t0 + cmovb cod, t1 + mov t0, kBitModelOffset + cmovb t0, kBitModelTotal_reg + lea t1_R, [probs + 3 * PMULT] + cmovae sym2_R, t1_R + PUP prob, probs + 1 * PMULT +endm + + +REV_1 macro prob:req, probNext:req, step:req + add sym2_R, step * PMULT + PLOAD probNext, sym2_R + + NORM_CALC prob + + cmovae range, t0 + PLOAD t0, sym2_R + step * PMULT + cmovae probNext, t0 + cmovb cod, t1 + mov t0, kBitModelOffset + cmovb t0, kBitModelTotal_reg + lea t1_R, [sym2_R + step * PMULT] + cmovae sym2_R, t1_R + PUP prob, t1_R - step * PMULT_2 +endm + + +REV_2 macro prob:req, step:req + sub sym2_R, probs + shr sym2, PSHIFT + or sym, sym2 + + NORM_CALC prob + + cmovae range, t0 + lea t0, [sym - step] + cmovb sym, t0 + cmovb cod, t1 + mov t0, kBitModelOffset + cmovb t0, kBitModelTotal_reg + PUP prob, probs + sym2_R * PMULT +endm + + +REV_1_VAR macro prob:req + PLOAD prob, sym_R + mov probs, sym_R + add sym_R, sym2_R + + NORM_CALC prob + + cmovae range, t0 + lea t0_R, [sym_R + sym2_R] + cmovae sym_R, t0_R + mov t0, kBitModelOffset + cmovb cod, t1 + ; mov t1, kBitModelTotal + ; cmovb t0, t1 + cmovb t0, kBitModelTotal_reg + add sym2, sym2 + PUP prob, probs +endm + + + + +LIT_PROBS macro lpMaskParam:req + ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc); + mov t0, processedPos + shl t0, 8 + add sym, t0 + and sym, lpMaskParam + add probs_state_R, pbPos_R + mov x1, LOC lc2 + lea sym, dword ptr[sym_R + 2 * sym_R] + add probs, Literal * PMULT + shl sym, x1_L + add probs, sym_R + UPDATE_0 probs_state_R, 0, IsMatch + inc processedPos +endm + + + +kNumPosBitsMax equ 4 +kNumPosStatesMax equ (1 SHL kNumPosBitsMax) + +kLenNumLowBits equ 3 +kLenNumLowSymbols equ (1 SHL kLenNumLowBits) +kLenNumHighBits equ 8 +kLenNumHighSymbols equ (1 SHL kLenNumHighBits) +kNumLenProbs equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols) + +LenLow equ 0 +LenChoice equ LenLow +LenChoice2 equ (LenLow + kLenNumLowSymbols) +LenHigh equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax) + +kNumStates equ 12 +kNumStates2 equ 16 +kNumLitStates equ 7 + +kStartPosModelIndex equ 4 +kEndPosModelIndex equ 14 +kNumFullDistances equ (1 SHL (kEndPosModelIndex SHR 1)) + +kNumPosSlotBits equ 6 +kNumLenToPosStates equ 4 + +kNumAlignBits equ 4 +kAlignTableSize equ (1 SHL kNumAlignBits) + +kMatchMinLen equ 2 +kMatchSpecLenStart equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols) + +kStartOffset equ 1664 +SpecPos equ (-kStartOffset) +IsRep0Long equ (SpecPos + kNumFullDistances) +RepLenCoder equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax)) +LenCoder equ (RepLenCoder + kNumLenProbs) +IsMatch equ (LenCoder + kNumLenProbs) +kAlign equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax)) +IsRep equ (kAlign + kAlignTableSize) +IsRepG0 equ (IsRep + kNumStates) +IsRepG1 equ (IsRepG0 + kNumStates) +IsRepG2 equ (IsRepG1 + kNumStates) +PosSlot equ (IsRepG2 + kNumStates) +Literal equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits)) +NUM_BASE_PROBS equ (Literal + kStartOffset) + +if kAlign ne 0 + .err +endif + +if NUM_BASE_PROBS ne 1984 + .err +endif + + +PTR_FIELD equ dq ? + +CLzmaDec_Asm struct + lc db ? + lp db ? + pb db ? + _pad_ db ? + dicSize dd ? + + probs_Spec PTR_FIELD + probs_1664 PTR_FIELD + dic_Spec PTR_FIELD + dicBufSize PTR_FIELD + dicPos_Spec PTR_FIELD + buf_Spec PTR_FIELD + + range_Spec dd ? + code_Spec dd ? + processedPos_Spec dd ? + checkDicSize dd ? + rep0 dd ? + rep1 dd ? + rep2 dd ? + rep3 dd ? + state_Spec dd ? + remainLen dd ? +CLzmaDec_Asm ends + + +CLzmaDec_Asm_Loc struct + OLD_RSP PTR_FIELD + lzmaPtr PTR_FIELD + _pad0_ PTR_FIELD + _pad1_ PTR_FIELD + _pad2_ PTR_FIELD + dicBufSize PTR_FIELD + probs_Spec PTR_FIELD + dic_Spec PTR_FIELD + + limit PTR_FIELD + bufLimit PTR_FIELD + lc2 dd ? + lpMask dd ? + pbMask dd ? + checkDicSize dd ? + + _pad_ dd ? + remainLen dd ? + dicPos_Spec PTR_FIELD + rep0 dd ? + rep1 dd ? + rep2 dd ? + rep3 dd ? +CLzmaDec_Asm_Loc ends + + +GLOB_2 equ [sym_R].CLzmaDec_Asm. +GLOB equ [r1].CLzmaDec_Asm. +LOC_0 equ [r0].CLzmaDec_Asm_Loc. +LOC equ [RSP].CLzmaDec_Asm_Loc. + + +COPY_VAR macro name + mov t0, GLOB_2 name + mov LOC_0 name, t0 +endm + + +RESTORE_VAR macro name + mov t0, LOC name + mov GLOB name, t0 +endm + + + +IsMatchBranch_Pre macro reg + ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState; + mov pbPos, LOC pbMask + and pbPos, processedPos + shl pbPos, (kLenNumLowBits + 1 + PSHIFT) + lea probs_state_R, [probs + state_R] +endm + + +IsMatchBranch macro reg + IsMatchBranch_Pre + IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label +endm + + +CheckLimits macro reg + cmp buf, LOC bufLimit + jae fin_OK + cmp dicPos, LOC limit + jae fin_OK +endm + + + +; RSP is (16x + 8) bytes aligned in WIN64-x64 +; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8) + +PARAM_lzma equ REG_PARAM_0 +PARAM_limit equ REG_PARAM_1 +PARAM_bufLimit equ REG_PARAM_2 + +; MY_ALIGN_64 +MY_PROC LzmaDec_DecodeReal_3, 3 +MY_PUSH_PRESERVED_REGS + + lea r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)] + and r0, -128 + mov r5, RSP + mov RSP, r0 + mov LOC_0 Old_RSP, r5 + mov LOC_0 lzmaPtr, PARAM_lzma + + mov LOC_0 remainLen, 0 ; remainLen must be ZERO + + mov LOC_0 bufLimit, PARAM_bufLimit + mov sym_R, PARAM_lzma ; CLzmaDec_Asm_Loc pointer for GLOB_2 + mov dic, GLOB_2 dic_Spec + add PARAM_limit, dic + mov LOC_0 limit, PARAM_limit + + COPY_VAR(rep0) + COPY_VAR(rep1) + COPY_VAR(rep2) + COPY_VAR(rep3) + + mov dicPos, GLOB_2 dicPos_Spec + add dicPos, dic + mov LOC_0 dicPos_Spec, dicPos + mov LOC_0 dic_Spec, dic + + mov x1_L, GLOB_2 pb + mov t0, 1 + shl t0, x1_L + dec t0 + mov LOC_0 pbMask, t0 + + ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1; + ; unsigned lc = p->prop.lc; + ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc); + + mov x1_L, GLOB_2 lc + mov x2, 100h + mov t0, x2 + shr x2, x1_L + ; inc x1 + add x1_L, PSHIFT + mov LOC_0 lc2, x1 + mov x1_L, GLOB_2 lp + shl t0, x1_L + sub t0, x2 + mov LOC_0 lpMask, t0 + mov lpMask_reg, t0 + + ; mov probs, GLOB_2 probs_Spec + ; add probs, kStartOffset SHL PSHIFT + mov probs, GLOB_2 probs_1664 + mov LOC_0 probs_Spec, probs + + mov t0_R, GLOB_2 dicBufSize + mov LOC_0 dicBufSize, t0_R + + mov x1, GLOB_2 checkDicSize + mov LOC_0 checkDicSize, x1 + + mov processedPos, GLOB_2 processedPos_Spec + + mov state, GLOB_2 state_Spec + shl state, PSHIFT + + mov buf, GLOB_2 buf_Spec + mov range, GLOB_2 range_Spec + mov cod, GLOB_2 code_Spec + mov kBitModelTotal_reg, kBitModelTotal + xor sym, sym + + ; if (processedPos != 0 || checkDicSize != 0) + or x1, processedPos + jz @f + + add t0_R, dic + cmp dicPos, dic + cmovnz t0_R, dicPos + movzx sym, byte ptr[t0_R - 1] + +@@: + IsMatchBranch_Pre + cmp state, 4 * PMULT + jb lit_end + cmp state, kNumLitStates * PMULT + jb lit_matched_end + jmp lz_end + + + + +; ---------- LITERAL ---------- +MY_ALIGN_64 +lit_start: + xor state, state +lit_start_2: + LIT_PROBS lpMask_reg + + ifdef _LZMA_SIZE_OPT + + PLOAD x1, probs + 1 * PMULT + mov sym, 1 +MY_ALIGN_16 +lit_loop: + BIT_1 x1, x2 + mov x1, x2 + cmp sym, 127 + jbe lit_loop + + else + + BIT_0 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + + endif + + BIT_2 x2, 256 - 1 + + ; mov dic, LOC dic_Spec + mov probs, LOC probs_Spec + IsMatchBranch_Pre + mov byte ptr[dicPos], sym_L + inc dicPos + + CheckLimits +lit_end: + IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start + + ; jmp IsMatch_label + +; ---------- MATCHES ---------- +; MY_ALIGN_32 +IsMatch_label: + UPDATE_1 probs_state_R, pbPos_R, IsMatch + IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label + + add probs, LenCoder * PMULT + add state, kNumStates * PMULT + +; ---------- LEN DECODE ---------- +len_decode: + mov len_temp, 8 - 1 - kMatchMinLen + IF_BIT_0_NOUP probs, 0, 0, len_mid_0 + UPDATE_1 probs, 0, 0 + add probs, (1 SHL (kLenNumLowBits + PSHIFT)) + mov len_temp, -1 - kMatchMinLen + IF_BIT_0_NOUP probs, 0, 0, len_mid_0 + UPDATE_1 probs, 0, 0 + add probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT)) + mov sym, 1 + PLOAD x1, probs + 1 * PMULT + +MY_ALIGN_32 +len8_loop: + BIT_1 x1, x2 + mov x1, x2 + cmp sym, 64 + jb len8_loop + + mov len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen + jmp len_mid_2 + +MY_ALIGN_32 +len_mid_0: + UPDATE_0 probs, 0, 0 + add probs, pbPos_R + BIT_0 x2, x1 +len_mid_2: + BIT_1 x1, x2 + BIT_2 x2, len_temp + mov probs, LOC probs_Spec + cmp state, kNumStates * PMULT + jb copy_match + + +; ---------- DECODE DISTANCE ---------- + ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits); + + mov t0, 3 + kMatchMinLen + cmp sym, 3 + kMatchMinLen + cmovb t0, sym + add probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT)) + shl t0, (kNumPosSlotBits + PSHIFT) + add probs, t0_R + + ; sym = Len + ; mov LOC remainLen, sym + mov len_temp, sym + + ifdef _LZMA_SIZE_OPT + + PLOAD x1, probs + 1 * PMULT + mov sym, 1 +MY_ALIGN_16 +slot_loop: + BIT_1 x1, x2 + mov x1, x2 + cmp sym, 32 + jb slot_loop + + else + + BIT_0 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + + endif + + mov x1, sym + BIT_2 x2, 64-1 + + and sym, 3 + mov probs, LOC probs_Spec + cmp x1, 32 + kEndPosModelIndex / 2 + jb short_dist + + ; unsigned numDirectBits = (unsigned)(((distance >> 1) - 1)); + sub x1, (32 + 1 + kNumAlignBits) + ; distance = (2 | (distance & 1)); + or sym, 2 + PLOAD x2, probs + 1 * PMULT + shl sym, kNumAlignBits + 1 + lea sym2_R, [probs + 2 * PMULT] + + jmp direct_norm + ; lea t1, [sym_R + (1 SHL kNumAlignBits)] + ; cmp range, kTopValue + ; jb direct_norm + +; ---------- DIRECT DISTANCE ---------- +MY_ALIGN_32 +direct_loop: + shr range, 1 + mov t0, cod + sub cod, range + cmovs cod, t0 + cmovns sym, t1 + + comment ~ + sub cod, range + mov x2, cod + sar x2, 31 + lea sym, dword ptr [r2 + sym_R * 2 + 1] + and x2, range + add cod, x2 + ~ + dec x1 + je direct_end + + add sym, sym +direct_norm: + lea t1, [sym_R + (1 SHL kNumAlignBits)] + cmp range, kTopValue + jae near ptr direct_loop + ; we align for 32 here with "near ptr" command above + NORM_2 + jmp direct_loop + +MY_ALIGN_32 +direct_end: + ; prob = + kAlign; + ; distance <<= kNumAlignBits; + REV_0 x2, x1 + REV_1 x1, x2, 2 + REV_1 x2, x1, 4 + REV_2 x1, 8 + +decode_dist_end: + + ; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize)) + + mov t0, LOC checkDicSize + test t0, t0 + cmove t0, processedPos + cmp sym, t0 + jae end_of_payload + + ; rep3 = rep2; + ; rep2 = rep1; + ; rep1 = rep0; + ; rep0 = distance + 1; + + inc sym + mov t0, LOC rep0 + mov t1, LOC rep1 + mov x1, LOC rep2 + mov LOC rep0, sym + ; mov sym, LOC remainLen + mov sym, len_temp + mov LOC rep1, t0 + mov LOC rep2, t1 + mov LOC rep3, x1 + + ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; + cmp state, (kNumStates + kNumLitStates) * PMULT + mov state, kNumLitStates * PMULT + mov t0, (kNumLitStates + 3) * PMULT + cmovae state, t0 + + +; ---------- COPY MATCH ---------- +copy_match: + + ; len += kMatchMinLen; + ; add sym, kMatchMinLen + + ; if ((rem = limit - dicPos) == 0) + ; { + ; p->dicPos = dicPos; + ; return SZ_ERROR_DATA; + ; } + mov cnt_R, LOC limit + sub cnt_R, dicPos + jz fin_ERROR + + ; curLen = ((rem < len) ? (unsigned)rem : len); + cmp cnt_R, sym_R + ; cmovae cnt_R, sym_R ; 64-bit + cmovae cnt, sym ; 32-bit + + mov dic, LOC dic_Spec + mov x1, LOC rep0 + + mov t0_R, dicPos + add dicPos, cnt_R + ; processedPos += curLen; + add processedPos, cnt + ; len -= curLen; + sub sym, cnt + mov LOC remainLen, sym + + sub t0_R, dic + + ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0); + sub t0_R, r1 + jae @f + + mov r1, LOC dicBufSize + add t0_R, r1 + sub r1, t0_R + cmp cnt_R, r1 + ja copy_match_cross +@@: + ; if (curLen <= dicBufSize - pos) + +; ---------- COPY MATCH FAST ---------- + ; Byte *dest = dic + dicPos; + ; mov r1, dic + ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos; + ; sub t0_R, dicPos + ; dicPos += curLen; + + ; const Byte *lim = dest + curLen; + add t0_R, dic + movzx sym, byte ptr[t0_R] + add t0_R, cnt_R + neg cnt_R + ; lea r1, [dicPos - 1] +copy_common: + dec dicPos + ; cmp LOC rep0, 1 + ; je rep0Label + + ; t0_R - src_lim + ; r1 - dest_lim - 1 + ; cnt_R - (-cnt) + + IsMatchBranch_Pre + inc cnt_R + jz copy_end +MY_ALIGN_16 +@@: + mov byte ptr[cnt_R * 1 + dicPos], sym_L + movzx sym, byte ptr[cnt_R * 1 + t0_R] + inc cnt_R + jnz @b + +copy_end: +lz_end_match: + mov byte ptr[dicPos], sym_L + inc dicPos + + ; IsMatchBranch_Pre + CheckLimits +lz_end: + IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label + + + +; ---------- LITERAL MATCHED ---------- + + LIT_PROBS LOC lpMask + + ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; + mov x1, LOC rep0 + ; mov dic, LOC dic_Spec + mov LOC dicPos_Spec, dicPos + + ; state -= (state < 10) ? 3 : 6; + lea t0, [state_R - 6 * PMULT] + sub state, 3 * PMULT + cmp state, 7 * PMULT + cmovae state, t0 + + sub dicPos, dic + sub dicPos, r1 + jae @f + add dicPos, LOC dicBufSize +@@: + comment ~ + xor t0, t0 + sub dicPos, r1 + cmovb t0_R, LOC dicBufSize + ~ + + movzx match, byte ptr[dic + dicPos * 1] + + ifdef _LZMA_SIZE_OPT + + mov offs, 256 * PMULT + shl match, (PSHIFT + 1) + mov bit, match + mov sym, 1 +MY_ALIGN_16 +litm_loop: + LITM + cmp sym, 256 + jb litm_loop + sub sym, 256 + + else + + LITM_0 + LITM + LITM + LITM + LITM + LITM + LITM + LITM_2 + + endif + + mov probs, LOC probs_Spec + IsMatchBranch_Pre + ; mov dic, LOC dic_Spec + mov dicPos, LOC dicPos_Spec + mov byte ptr[dicPos], sym_L + inc dicPos + + CheckLimits +lit_matched_end: + IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label + ; IsMatchBranch + mov lpMask_reg, LOC lpMask + sub state, 3 * PMULT + jmp lit_start_2 + + + +; ---------- REP 0 LITERAL ---------- +MY_ALIGN_32 +IsRep0Short_label: + UPDATE_0 probs_state_R, pbPos_R, IsRep0Long + + ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; + mov dic, LOC dic_Spec + mov t0_R, dicPos + mov probBranch, LOC rep0 + sub t0_R, dic + + sub probs, RepLenCoder * PMULT + inc processedPos + ; state = state < kNumLitStates ? 9 : 11; + or state, 1 * PMULT + IsMatchBranch_Pre + + sub t0_R, probBranch_R + jae @f + add t0_R, LOC dicBufSize +@@: + movzx sym, byte ptr[dic + t0_R * 1] + jmp lz_end_match + + +MY_ALIGN_32 +IsRep_label: + UPDATE_1 probs_state_R, 0, IsRep + + ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode. + ; So we don't check it here. + + ; mov t0, processedPos + ; or t0, LOC checkDicSize + ; jz fin_ERROR_2 + + ; state = state < kNumLitStates ? 8 : 11; + cmp state, kNumLitStates * PMULT + mov state, 8 * PMULT + mov probBranch, 11 * PMULT + cmovae state, probBranch + + ; prob = probs + RepLenCoder; + add probs, RepLenCoder * PMULT + + IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label + IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label + UPDATE_1 probs_state_R, pbPos_R, IsRep0Long + jmp len_decode + +MY_ALIGN_32 +IsRepG0_label: + UPDATE_1 probs_state_R, 0, IsRepG0 + mov dist2, LOC rep0 + mov dist, LOC rep1 + mov LOC rep1, dist2 + + IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label + mov LOC rep0, dist + jmp len_decode + +; MY_ALIGN_32 +IsRepG1_label: + UPDATE_1 probs_state_R, 0, IsRepG1 + mov dist2, LOC rep2 + mov LOC rep2, dist + + IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label + mov LOC rep0, dist2 + jmp len_decode + +; MY_ALIGN_32 +IsRepG2_label: + UPDATE_1 probs_state_R, 0, IsRepG2 + mov dist, LOC rep3 + mov LOC rep3, dist2 + mov LOC rep0, dist + jmp len_decode + + + +; ---------- SPEC SHORT DISTANCE ---------- + +MY_ALIGN_32 +short_dist: + sub x1, 32 + 1 + jbe decode_dist_end + or sym, 2 + shl sym, x1_L + lea sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT] + mov sym2, PMULT ; step +MY_ALIGN_32 +spec_loop: + REV_1_VAR x2 + dec x1 + jnz spec_loop + + mov probs, LOC probs_Spec + sub sym, sym2 + sub sym, SpecPos * PMULT + sub sym_R, probs + shr sym, PSHIFT + + jmp decode_dist_end + + +; ---------- COPY MATCH CROSS ---------- +copy_match_cross: + ; t0_R - src pos + ; r1 - len to dicBufSize + ; cnt_R - total copy len + + mov t1_R, t0_R ; srcPos + mov t0_R, dic + mov r1, LOC dicBufSize ; + neg cnt_R +@@: + movzx sym, byte ptr[t1_R * 1 + t0_R] + inc t1_R + mov byte ptr[cnt_R * 1 + dicPos], sym_L + inc cnt_R + cmp t1_R, r1 + jne @b + + movzx sym, byte ptr[t0_R] + sub t0_R, cnt_R + jmp copy_common + + + + +fin_ERROR: + mov LOC remainLen, len_temp +; fin_ERROR_2: + mov sym, 1 + jmp fin + +end_of_payload: + cmp sym, 0FFFFFFFFh ; -1 + jne fin_ERROR + + mov LOC remainLen, kMatchSpecLenStart + sub state, kNumStates * PMULT + +fin_OK: + xor sym, sym + +fin: + NORM + + mov r1, LOC lzmaPtr + + sub dicPos, LOC dic_Spec + mov GLOB dicPos_Spec, dicPos + mov GLOB buf_Spec, buf + mov GLOB range_Spec, range + mov GLOB code_Spec, cod + shr state, PSHIFT + mov GLOB state_Spec, state + mov GLOB processedPos_Spec, processedPos + + RESTORE_VAR(remainLen) + RESTORE_VAR(rep0) + RESTORE_VAR(rep1) + RESTORE_VAR(rep2) + RESTORE_VAR(rep3) + + mov x0, sym + + mov RSP, LOC Old_RSP + +MY_POP_PRESERVED_REGS +MY_ENDP + +_TEXT$LZMADECOPT ENDS + +end diff --git a/other-licenses/7zstub/src/Asm/x86/XzCrc64Opt.asm b/other-licenses/7zstub/src/Asm/x86/XzCrc64Opt.asm new file mode 100644 index 000000000..3e6d49026 --- /dev/null +++ b/other-licenses/7zstub/src/Asm/x86/XzCrc64Opt.asm @@ -0,0 +1,205 @@ +; XzCrc64Opt.asm -- CRC64 calculation : optimized version +; 2011-06-28 : Igor Pavlov : Public domain + +include 7zAsm.asm + +MY_ASM_START + +ifdef x64 + + rD equ r9 + rN equ r10 + + num_VAR equ r8 + table_VAR equ r9 + + SRCDAT equ rN + rD + +CRC_XOR macro dest:req, src:req, t:req + xor dest, QWORD PTR [r5 + src * 8 + 0800h * t] +endm + +CRC1b macro + movzx x6, BYTE PTR [rD] + inc rD + movzx x3, x0_L + xor x6, x3 + shr r0, 8 + CRC_XOR r0, r6, 0 + dec rN +endm + +MY_PROLOG macro crc_end:req + MY_PUSH_4_REGS + + mov r0, r1 + mov rN, num_VAR + mov r5, table_VAR + mov rD, r2 + test rN, rN + jz crc_end + @@: + test rD, 3 + jz @F + CRC1b + jnz @B + @@: + cmp rN, 8 + jb crc_end + add rN, rD + mov num_VAR, rN + sub rN, 4 + and rN, NOT 3 + sub rD, rN + mov x1, [SRCDAT] + xor r0, r1 + add rN, 4 +endm + +MY_EPILOG macro crc_end:req + sub rN, 4 + mov x1, [SRCDAT] + xor r0, r1 + mov rD, rN + mov rN, num_VAR + sub rN, rD + crc_end: + test rN, rN + jz @F + CRC1b + jmp crc_end + @@: + MY_POP_4_REGS +endm + +MY_PROC XzCrc64UpdateT4, 4 + MY_PROLOG crc_end_4 + align 16 + main_loop_4: + mov x1, [SRCDAT] + movzx x2, x0_L + movzx x3, x0_H + shr r0, 16 + movzx x6, x0_L + movzx x7, x0_H + shr r0, 16 + CRC_XOR r1, r2, 3 + CRC_XOR r0, r3, 2 + CRC_XOR r1, r6, 1 + CRC_XOR r0, r7, 0 + xor r0, r1 + + add rD, 4 + jnz main_loop_4 + + MY_EPILOG crc_end_4 +MY_ENDP + +else + + rD equ r1 + rN equ r7 + + crc_val equ (REG_SIZE * 5) + crc_table equ (8 + crc_val) + table_VAR equ [r4 + crc_table] + num_VAR equ table_VAR + + + SRCDAT equ rN + rD + +CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req + op0 dest0, DWORD PTR [r5 + src * 8 + 0800h * t] + op1 dest1, DWORD PTR [r5 + src * 8 + 0800h * t + 4] +endm + +CRC_XOR macro dest0:req, dest1:req, src:req, t:req + CRC xor, xor, dest0, dest1, src, t +endm + + +CRC1b macro + movzx x6, BYTE PTR [rD] + inc rD + movzx x3, x0_L + xor x6, x3 + shrd r0, r2, 8 + shr r2, 8 + CRC_XOR r0, r2, r6, 0 + dec rN +endm + +MY_PROLOG macro crc_end:req + MY_PUSH_4_REGS + + mov rN, r2 + + mov x0, [r4 + crc_val] + mov x2, [r4 + crc_val + 4] + mov r5, table_VAR + test rN, rN + jz crc_end + @@: + test rD, 3 + jz @F + CRC1b + jnz @B + @@: + cmp rN, 8 + jb crc_end + add rN, rD + + mov num_VAR, rN + + sub rN, 4 + and rN, NOT 3 + sub rD, rN + xor r0, [SRCDAT] + add rN, 4 +endm + +MY_EPILOG macro crc_end:req + sub rN, 4 + xor r0, [SRCDAT] + + mov rD, rN + mov rN, num_VAR + sub rN, rD + crc_end: + test rN, rN + jz @F + CRC1b + jmp crc_end + @@: + MY_POP_4_REGS +endm + +MY_PROC XzCrc64UpdateT4, 5 + MY_PROLOG crc_end_4 + movzx x6, x0_L + align 16 + main_loop_4: + mov r3, [SRCDAT] + xor r3, r2 + + CRC xor, mov, r3, r2, r6, 3 + movzx x6, x0_H + shr r0, 16 + CRC_XOR r3, r2, r6, 2 + + movzx x6, x0_L + movzx x0, x0_H + CRC_XOR r3, r2, r6, 1 + CRC_XOR r3, r2, r0, 0 + movzx x6, x3_L + mov r0, r3 + + add rD, 4 + jnz main_loop_4 + + MY_EPILOG crc_end_4 +MY_ENDP + +endif + +end -- cgit v1.2.3