/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * vim: set ts=8 sts=4 et sw=4 tw=99:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "jit/x86-shared/Lowering-x86-shared.h"

#include "mozilla/MathAlgorithms.h"

#include "jit/MIR.h"

#include "jit/shared/Lowering-shared-inl.h"

using namespace js;
using namespace js::jit;

using mozilla::Abs;
using mozilla::FloorLog2;
using mozilla::Swap;

LTableSwitch*
LIRGeneratorX86Shared::newLTableSwitch(const LAllocation& in, const LDefinition& inputCopy,
                                       MTableSwitch* tableswitch)
{
    return new(alloc()) LTableSwitch(in, inputCopy, temp(), tableswitch);
}

LTableSwitchV*
LIRGeneratorX86Shared::newLTableSwitchV(MTableSwitch* tableswitch)
{
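    // Unboxing the value and converting it to an int32 index needs both
    // integer temps and a double temp.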
    return new(alloc()) LTableSwitchV(useBox(tableswitch->getOperand(0)),
                                      temp(), tempDouble(), temp(), tableswitch);
}

void
LIRGeneratorX86Shared::visitGuardShape(MGuardShape* ins)
{
    MOZ_ASSERT(ins->object()->type() == MIRType::Object);

    LGuardShape* guard = new(alloc()) LGuardShape(useRegisterAtStart(ins->object()));
    assignSnapshot(guard, ins->bailoutKind());
    add(guard, ins);
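    // The guard produces no value of its own; consumers of ins simply see
    // the guarded object.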
    redefine(ins, ins->object());
}

void
LIRGeneratorX86Shared::visitGuardObjectGroup(MGuardObjectGroup* ins)
{
    MOZ_ASSERT(ins->object()->type() == MIRType::Object);

    LGuardObjectGroup* guard = new(alloc()) LGuardObjectGroup(useRegisterAtStart(ins->object()));
    assignSnapshot(guard, ins->bailoutKind());
    add(guard, ins);
    redefine(ins, ins->object());
}

void
LIRGeneratorX86Shared::visitPowHalf(MPowHalf* ins)
{
    MDefinition* input = ins->input();
    MOZ_ASSERT(input->type() == MIRType::Double);
    LPowHalfD* lir = new(alloc()) LPowHalfD(useRegisterAtStart(input));
    define(lir, ins);
}

void
LIRGeneratorX86Shared::lowerForShift(LInstructionHelper<1, 2, 0>* ins, MDefinition* mir,
                                     MDefinition* lhs, MDefinition* rhs)
{
    ins->setOperand(0, useRegisterAtStart(lhs));

    // The shift amount must be either a constant or in register ecx:
    // x86 shift instructions only accept an immediate or cl as the count.
    if (rhs->isConstant())
        ins->setOperand(1, useOrConstantAtStart(rhs));
    else
        ins->setOperand(1, lhs != rhs ? useFixed(rhs, ecx) : useFixedAtStart(rhs, ecx));

    defineReuseInput(ins, mir, 0);
}

template<size_t Temps>
void
LIRGeneratorX86Shared::lowerForShiftInt64(LInstructionHelper<INT64_PIECES, INT64_PIECES + 1, Temps>* ins,
                                          MDefinition* mir, MDefinition* lhs, MDefinition* rhs)
{
    ins->setInt64Operand(0, useInt64RegisterAtStart(lhs));
#if defined(JS_NUNBOX32)
    if (mir->isRotate())
        ins->setTemp(0, temp());
#endif

    static_assert(LShiftI64::Rhs == INT64_PIECES, "Assume Rhs is located at INT64_PIECES.");
    static_assert(LRotateI64::Count == INT64_PIECES, "Assume Count is located at INT64_PIECES.");

    // The shift amount must be either a constant or in register ecx:
    // x86 shift instructions only accept an immediate or cl as the count.
    if (rhs->isConstant()) {
        ins->setOperand(INT64_PIECES, useOrConstantAtStart(rhs));
    } else {
        // The operands are int64, but we only care about the lower 32 bits of
        // the RHS. On 32-bit, the code below will load that part in ecx and
        // will discard the upper half.
        ensureDefined(rhs);
        LUse use(ecx);
        use.setVirtualRegister(rhs->virtualRegister());
        ins->setOperand(INT64_PIECES, use);
    }

    defineInt64ReuseInput(ins, mir, 0);
}

template void LIRGeneratorX86Shared::lowerForShiftInt64(
    LInstructionHelper<INT64_PIECES, INT64_PIECES+1, 0>* ins, MDefinition* mir,
    MDefinition* lhs, MDefinition* rhs);
template void LIRGeneratorX86Shared::lowerForShiftInt64(
    LInstructionHelper<INT64_PIECES, INT64_PIECES+1, 1>* ins, MDefinition* mir,
    MDefinition* lhs, MDefinition* rhs);

void
LIRGeneratorX86Shared::lowerForALU(LInstructionHelper<1, 1, 0>* ins, MDefinition* mir,
                                   MDefinition* input)
{
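    // x86 ALU instructions overwrite their operand, so the output must
    // reuse the input's register.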
    ins->setOperand(0, useRegisterAtStart(input));
    defineReuseInput(ins, mir, 0);
}

void
LIRGeneratorX86Shared::lowerForALU(LInstructionHelper<1, 2, 0>* ins, MDefinition* mir,
                                   MDefinition* lhs, MDefinition* rhs)
{
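    // Two-operand x86 encodings clobber the left-hand input. If lhs and rhs
    // are the same definition, both uses must be at-start so they can share
    // the register that the output reuses.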
    ins->setOperand(0, useRegisterAtStart(lhs));
    ins->setOperand(1, lhs != rhs ? useOrConstant(rhs) : useOrConstantAtStart(rhs));
    defineReuseInput(ins, mir, 0);
}

template<size_t Temps>
void
LIRGeneratorX86Shared::lowerForFPU(LInstructionHelper<1, 2, Temps>* ins, MDefinition* mir, MDefinition* lhs, MDefinition* rhs)
{
    // Without AVX, we'll need to use the x86 encodings where one of the
    // inputs must be the same location as the output.
    if (!Assembler::HasAVX()) {
        ins->setOperand(0, useRegisterAtStart(lhs));
        ins->setOperand(1, lhs != rhs ? use(rhs) : useAtStart(rhs));
        defineReuseInput(ins, mir, 0);
    } else {
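        // AVX VEX encodings are three-operand and non-destructive, so the
        // output can be allocated freely.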
        ins->setOperand(0, useRegisterAtStart(lhs));
        ins->setOperand(1, useAtStart(rhs));
        define(ins, mir);
    }
}

template void LIRGeneratorX86Shared::lowerForFPU(LInstructionHelper<1, 2, 0>* ins, MDefinition* mir,
                                                 MDefinition* lhs, MDefinition* rhs);
template void LIRGeneratorX86Shared::lowerForFPU(LInstructionHelper<1, 2, 1>* ins, MDefinition* mir,
                                                 MDefinition* lhs, MDefinition* rhs);

void
LIRGeneratorX86Shared::lowerForCompIx4(LSimdBinaryCompIx4* ins, MSimdBinaryComp* mir, MDefinition* lhs, MDefinition* rhs)
{
    lowerForALU(ins, mir, lhs, rhs);
}

void
LIRGeneratorX86Shared::lowerForCompFx4(LSimdBinaryCompFx4* ins, MSimdBinaryComp* mir, MDefinition* lhs, MDefinition* rhs)
{
    // Swap the operands around to fit the instructions that x86 actually has.
    // We do this here, before register allocation, so that we don't need
    // temporaries and copying afterwards.
    switch (mir->operation()) {
      case MSimdBinaryComp::greaterThan:
      case MSimdBinaryComp::greaterThanOrEqual:
        mir->reverse();
        Swap(lhs, rhs);
        break;
      default:
        break;
    }

    lowerForFPU(ins, mir, lhs, rhs);
}

void
LIRGeneratorX86Shared::lowerForBitAndAndBranch(LBitAndAndBranch* baab, MInstruction* mir,
                                               MDefinition* lhs, MDefinition* rhs)
{
    baab->setOperand(0, useRegisterAtStart(lhs));
    baab->setOperand(1, useRegisterOrConstantAtStart(rhs));
    add(baab, mir);
}

void
LIRGeneratorX86Shared::lowerMulI(MMul* mul, MDefinition* lhs, MDefinition* rhs)
{
    // Note: If we need a negative zero check, lhs is used twice.
    LAllocation lhsCopy = mul->canBeNegativeZero() ? use(lhs) : LAllocation();
    LMulI* lir = new(alloc()) LMulI(useRegisterAtStart(lhs), useOrConstant(rhs), lhsCopy);
    if (mul->fallible())
        assignSnapshot(lir, Bailout_DoubleOutput);
    defineReuseInput(lir, mul, 0);
}

void
LIRGeneratorX86Shared::lowerDivI(MDiv* div)
{
    if (div->isUnsigned()) {
        lowerUDiv(div);
        return;
    }

    // Division instructions are slow. Division by constant denominators can be
    // rewritten to use other instructions.
    if (div->rhs()->isConstant()) {
        int32_t rhs = div->rhs()->toConstant()->toInt32();

        // Division by powers of two can be done by shifting, and division by
        // other numbers can be done by a reciprocal multiplication technique.
        int32_t shift = FloorLog2(Abs(rhs));
        if (rhs != 0 && uint32_t(1) << shift == Abs(rhs)) {
            LAllocation lhs = useRegisterAtStart(div->lhs());
            LDivPowTwoI* lir;
            if (!div->canBeNegativeDividend()) {
                // The numerator is known to be non-negative, so it does not
                // need adjusting.
                lir = new(alloc()) LDivPowTwoI(lhs, lhs, shift, rhs < 0);
            } else {
                // The numerator may be negative, so it needs adjusting before
                // the shift, which requires an extra copy of lhs.
                lir = new(alloc()) LDivPowTwoI(lhs, useRegister(div->lhs()), shift, rhs < 0);
            }
            if (div->fallible())
                assignSnapshot(lir, Bailout_DoubleOutput);
            defineReuseInput(lir, div, 0);
            return;
        }
        if (rhs != 0) {
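            // The constant-divisor path multiplies by a precomputed
            // reciprocal; the 32x32->64-bit multiply leaves its high half
            // in edx and clobbers eax.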
            LDivOrModConstantI* lir;
            lir = new(alloc()) LDivOrModConstantI(useRegister(div->lhs()), rhs, tempFixed(eax));
            if (div->fallible())
                assignSnapshot(lir, Bailout_DoubleOutput);
            defineFixed(lir, div, LAllocation(AnyRegister(edx)));
            return;
        }
    }

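    // General case: idiv takes its dividend in edx:eax and leaves the
    // quotient in eax and the remainder in edx, so both registers are pinned.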
    LDivI* lir = new(alloc()) LDivI(useRegister(div->lhs()), useRegister(div->rhs()),
                                    tempFixed(edx));
    if (div->fallible())
        assignSnapshot(lir, Bailout_DoubleOutput);
    defineFixed(lir, div, LAllocation(AnyRegister(eax)));
}

void
LIRGeneratorX86Shared::lowerModI(MMod* mod)
{
    if (mod->isUnsigned()) {
        lowerUMod(mod);
        return;
    }

    if (mod->rhs()->isConstant()) {
        int32_t rhs = mod->rhs()->toConstant()->toInt32();
        int32_t shift = FloorLog2(Abs(rhs));
        if (rhs != 0 && uint32_t(1) << shift == Abs(rhs)) {
            LModPowTwoI* lir = new(alloc()) LModPowTwoI(useRegisterAtStart(mod->lhs()), shift);
            if (mod->fallible())
                assignSnapshot(lir, Bailout_DoubleOutput);
            defineReuseInput(lir, mod, 0);
            return;
        }
        if (rhs != 0) {
            LDivOrModConstantI* lir;
            lir = new(alloc()) LDivOrModConstantI(useRegister(mod->lhs()), rhs, tempFixed(edx));
            if (mod->fallible())
                assignSnapshot(lir, Bailout_DoubleOutput);
            defineFixed(lir, mod, LAllocation(AnyRegister(eax)));
            return;
        }
    }

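    // General case: the remainder computed by idiv ends up in edx, and eax
    // is clobbered by the quotient.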
    LModI* lir = new(alloc()) LModI(useRegister(mod->lhs()),
                                    useRegister(mod->rhs()),
                                    tempFixed(eax));
    if (mod->fallible())
        assignSnapshot(lir, Bailout_DoubleOutput);
    defineFixed(lir, mod, LAllocation(AnyRegister(edx)));
}

void
LIRGeneratorX86Shared::visitWasmSelect(MWasmSelect* ins)
{
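    // Selects lower to a test of the condition followed by a conditional
    // move, which overwrites one operand: reuse the true-expression register
    // as the output.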
    if (ins->type() == MIRType::Int64) {
        auto* lir = new(alloc()) LWasmSelectI64(useInt64RegisterAtStart(ins->trueExpr()),
                                                useInt64(ins->falseExpr()),
                                                useRegister(ins->condExpr()));

        defineInt64ReuseInput(lir, ins, LWasmSelectI64::TrueExprIndex);
        return;
    }

    auto* lir = new(alloc()) LWasmSelect(useRegisterAtStart(ins->trueExpr()),
                                         use(ins->falseExpr()),
                                         useRegister(ins->condExpr()));

    defineReuseInput(lir, ins, LWasmSelect::TrueExprIndex);
}

void
LIRGeneratorX86Shared::visitAsmJSNeg(MAsmJSNeg* ins)
{
    switch (ins->type()) {
      case MIRType::Int32:
        defineReuseInput(new(alloc()) LNegI(useRegisterAtStart(ins->input())), ins, 0);
        break;
      case MIRType::Float32:
        defineReuseInput(new(alloc()) LNegF(useRegisterAtStart(ins->input())), ins, 0);
        break;
      case MIRType::Double:
        defineReuseInput(new(alloc()) LNegD(useRegisterAtStart(ins->input())), ins, 0);
        break;
      default:
        MOZ_CRASH();
    }
}

void
LIRGeneratorX86Shared::lowerWasmLoad(MWasmLoad* ins)
{
    MOZ_ASSERT(ins->type() != MIRType::Int64);

    MDefinition* base = ins->base();
    MOZ_ASSERT(base->type() == MIRType::Int32);

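    // A constant zero base may be folded into the effective address rather
    // than occupying a register.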
    auto* lir = new(alloc()) LWasmLoad(useRegisterOrZeroAtStart(base));
    define(lir, ins);
}

void
LIRGeneratorX86Shared::lowerUDiv(MDiv* div)
{
    if (div->rhs()->isConstant()) {
        uint32_t rhs = div->rhs()->toConstant()->toInt32();
        int32_t shift = FloorLog2(rhs);

        LAllocation lhs = useRegisterAtStart(div->lhs());
        if (rhs != 0 && uint32_t(1) << shift == rhs) {
            LDivPowTwoI* lir = new(alloc()) LDivPowTwoI(lhs, lhs, shift, false);
            if (div->fallible())
                assignSnapshot(lir, Bailout_DoubleOutput);
            defineReuseInput(lir, div, 0);
        } else {
            LUDivOrModConstant* lir = new(alloc()) LUDivOrModConstant(useRegister(div->lhs()),
                                                                      rhs, tempFixed(eax));
            if (div->fallible())
                assignSnapshot(lir, Bailout_DoubleOutput);
            defineFixed(lir, div, LAllocation(AnyRegister(edx)));
        }
        return;
    }

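    // General case: unsigned div uses the same register scheme as idiv, with
    // the quotient in eax and the remainder clobbering edx.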
    LUDivOrMod* lir = new(alloc()) LUDivOrMod(useRegister(div->lhs()),
                                              useRegister(div->rhs()),
                                              tempFixed(edx));
    if (div->fallible())
        assignSnapshot(lir, Bailout_DoubleOutput);
    defineFixed(lir, div, LAllocation(AnyRegister(eax)));
}

void
LIRGeneratorX86Shared::lowerUMod(MMod* mod)
{
    if (mod->rhs()->isConstant()) {
        uint32_t rhs = mod->rhs()->toConstant()->toInt32();
        int32_t shift = FloorLog2(rhs);

        if (rhs != 0 && uint32_t(1) << shift == rhs) {
            LModPowTwoI* lir = new(alloc()) LModPowTwoI(useRegisterAtStart(mod->lhs()), shift);
            if (mod->fallible())
                assignSnapshot(lir, Bailout_DoubleOutput);
            defineReuseInput(lir, mod, 0);
        } else {
            LUDivOrModConstant* lir = new(alloc()) LUDivOrModConstant(useRegister(mod->lhs()),
                                                                      rhs, tempFixed(edx));
            if (mod->fallible())
                assignSnapshot(lir, Bailout_DoubleOutput);
            defineFixed(lir, mod, LAllocation(AnyRegister(eax)));
        }
        return;
    }

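    // General case: the unsigned remainder ends up in edx; eax is clobbered
    // by the quotient.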
    LUDivOrMod* lir = new(alloc()) LUDivOrMod(useRegister(mod->lhs()),
                                              useRegister(mod->rhs()),
                                              tempFixed(eax));
    if (mod->fallible())
        assignSnapshot(lir, Bailout_DoubleOutput);
    defineFixed(lir, mod, LAllocation(AnyRegister(edx)));
}

void
LIRGeneratorX86Shared::lowerUrshD(MUrsh* mir)
{
    MDefinition* lhs = mir->lhs();
    MDefinition* rhs = mir->rhs();

    MOZ_ASSERT(lhs->type() == MIRType::Int32);
    MOZ_ASSERT(rhs->type() == MIRType::Int32);
    MOZ_ASSERT(mir->type() == MIRType::Double);

#ifdef JS_CODEGEN_X64
    MOZ_ASSERT(ecx == rcx);
#endif

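    // The shift count, when not a constant, must be in ecx. The shift is
    // performed on a temp copy of lhs so the input register is not clobbered.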
    LUse lhsUse = useRegisterAtStart(lhs);
    LAllocation rhsAlloc = rhs->isConstant() ? useOrConstant(rhs) : useFixed(rhs, ecx);

    LUrshD* lir = new(alloc()) LUrshD(lhsUse, rhsAlloc, tempCopy(lhs, 0));
    define(lir, mir);
}

void
LIRGeneratorX86Shared::lowerTruncateDToInt32(MTruncateToInt32* ins)
{
    MDefinition* opd = ins->input();
    MOZ_ASSERT(opd->type() == MIRType::Double);

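    // With SSE3 the truncation can use fisttp and needs no temp; without it,
    // the out-of-line path needs a scratch double.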
    LDefinition maybeTemp = Assembler::HasSSE3() ? LDefinition::BogusTemp() : tempDouble();
    define(new(alloc()) LTruncateDToInt32(useRegister(opd), maybeTemp), ins);
}

void
LIRGeneratorX86Shared::lowerTruncateFToInt32(MTruncateToInt32* ins)
{
    MDefinition* opd = ins->input();
    MOZ_ASSERT(opd->type() == MIRType::Float32);

    LDefinition maybeTemp = Assembler::HasSSE3() ? LDefinition::BogusTemp() : tempFloat32();
    define(new(alloc()) LTruncateFToInt32(useRegister(opd), maybeTemp), ins);
}

void
LIRGeneratorX86Shared::lowerCompareExchangeTypedArrayElement(MCompareExchangeTypedArrayElement* ins,
                                                             bool useI386ByteRegisters)
{
    MOZ_ASSERT(ins->arrayType() != Scalar::Float32);
    MOZ_ASSERT(ins->arrayType() != Scalar::Float64);

    MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
    MOZ_ASSERT(ins->index()->type() == MIRType::Int32);

    const LUse elements = useRegister(ins->elements());
    const LAllocation index = useRegisterOrConstant(ins->index());

    // If the target is a floating-point register then we need a temp at the
    // lower level; that temp must be eax.
    //
    // Otherwise the target (if used) is an integer register, which
    // must be eax.  If the target is not used the machine code will
    // still clobber eax, so just pretend it's used.
    //
    // oldval must be in a register.
    //
    // newval must be in a register.  If the source is a byte array
    // then newval must be a register that has a byte size: on x86
    // this must be ebx, ecx, or edx (eax is taken for the output).
    //
    // Bug #1077036 describes some further optimization opportunities.

    bool fixedOutput = false;
    LDefinition tempDef = LDefinition::BogusTemp();
    LAllocation newval;
    if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) {
        tempDef = tempFixed(eax);
        newval = useRegister(ins->newval());
    } else {
        fixedOutput = true;
        if (useI386ByteRegisters && ins->isByteArray())
            newval = useFixed(ins->newval(), ebx);
        else
            newval = useRegister(ins->newval());
    }

    const LAllocation oldval = useRegister(ins->oldval());

    LCompareExchangeTypedArrayElement* lir =
        new(alloc()) LCompareExchangeTypedArrayElement(elements, index, oldval, newval, tempDef);

    if (fixedOutput)
        defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
    else
        define(lir, ins);
}

void
LIRGeneratorX86Shared::lowerAtomicExchangeTypedArrayElement(MAtomicExchangeTypedArrayElement* ins,
                                                            bool useI386ByteRegisters)
{
    MOZ_ASSERT(ins->arrayType() <= Scalar::Uint32);

    MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
    MOZ_ASSERT(ins->index()->type() == MIRType::Int32);

    const LUse elements = useRegister(ins->elements());
    const LAllocation index = useRegisterOrConstant(ins->index());
    const LAllocation value = useRegister(ins->value());

    // The underlying instruction is XCHG, which can operate on any
    // register.
    //
    // If the target is a floating-point register (for Uint32) then we need
    // a temp into which to exchange.
    //
    // If the source is a byte array then we need a register that has
    // a byte size; in this case -- on x86 only -- pin the output to
    // an appropriate register and use that as a temp in the back-end.

    LDefinition tempDef = LDefinition::BogusTemp();
    if (ins->arrayType() == Scalar::Uint32) {
        // This restriction is bug 1077305.
        MOZ_ASSERT(ins->type() == MIRType::Double);
        tempDef = temp();
    }

    LAtomicExchangeTypedArrayElement* lir =
        new(alloc()) LAtomicExchangeTypedArrayElement(elements, index, value, tempDef);

    if (useI386ByteRegisters && ins->isByteArray())
        defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
    else
        define(lir, ins);
}

void
LIRGeneratorX86Shared::lowerAtomicTypedArrayElementBinop(MAtomicTypedArrayElementBinop* ins,
                                                         bool useI386ByteRegisters)
{
    MOZ_ASSERT(ins->arrayType() != Scalar::Uint8Clamped);
    MOZ_ASSERT(ins->arrayType() != Scalar::Float32);
    MOZ_ASSERT(ins->arrayType() != Scalar::Float64);

    MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
    MOZ_ASSERT(ins->index()->type() == MIRType::Int32);

    const LUse elements = useRegister(ins->elements());
    const LAllocation index = useRegisterOrConstant(ins->index());

    // Case 1: the result of the operation is not used.
    //
    // We'll emit a single instruction: LOCK ADD, LOCK SUB, LOCK AND,
    // LOCK OR, or LOCK XOR.  We can do this even for the Uint32 case.

    if (!ins->hasUses()) {
        LAllocation value;
        if (useI386ByteRegisters && ins->isByteArray() && !ins->value()->isConstant())
            value = useFixed(ins->value(), ebx);
        else
            value = useRegisterOrConstant(ins->value());

        LAtomicTypedArrayElementBinopForEffect* lir =
            new(alloc()) LAtomicTypedArrayElementBinopForEffect(elements, index, value);

        add(lir, ins);
        return;
    }

    // Case 2: the result of the operation is used.
    //
    // For ADD and SUB we'll use XADD:
    //
    //    movl       src, output
    //    lock xaddl output, mem
    //
    // For the 8-bit variants XADD needs a byte register for the output.
    //
    // For AND/OR/XOR we need to use a CMPXCHG loop:
    //
    //    movl          *mem, eax
    // L: mov           eax, temp
    //    andl          src, temp
    //    lock cmpxchg  temp, mem  ; reads eax also
    //    jnz           L
    //    ; result in eax
    //
    // Note the placement of L: cmpxchg updates eax with *mem if *mem
    // does not have the expected value, so reloading it at the top of
    // the loop would be redundant.
    //
    // If the array is not a uint32 array then:
    //  - eax should be the output (one result of the cmpxchg)
    //  - there is a temp, which must have a byte register if
    //    the array has 1-byte elements
    //
    // If the array is a uint32 array then:
    //  - eax is the first temp
    //  - we also need a second temp
    //
    // There are optimization opportunities:
    //  - better register allocation in the x86 8-bit case, Bug #1077036.

    bool bitOp = !(ins->operation() == AtomicFetchAddOp || ins->operation() == AtomicFetchSubOp);
    bool fixedOutput = true;
    bool reuseInput = false;
    LDefinition tempDef1 = LDefinition::BogusTemp();
    LDefinition tempDef2 = LDefinition::BogusTemp();
    LAllocation value;

    if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) {
        value = useRegisterOrConstant(ins->value());
        fixedOutput = false;
        if (bitOp) {
            tempDef1 = tempFixed(eax);
            tempDef2 = temp();
        } else {
            tempDef1 = temp();
        }
    } else if (useI386ByteRegisters && ins->isByteArray()) {
        if (ins->value()->isConstant())
            value = useRegisterOrConstant(ins->value());
        else
            value = useFixed(ins->value(), ebx);
        if (bitOp)
            tempDef1 = tempFixed(ecx);
    } else if (bitOp) {
        value = useRegisterOrConstant(ins->value());
        tempDef1 = temp();
    } else if (ins->value()->isConstant()) {
        fixedOutput = false;
        value = useRegisterOrConstant(ins->value());
    } else {
        fixedOutput = false;
        reuseInput = true;
        value = useRegisterAtStart(ins->value());
    }

    LAtomicTypedArrayElementBinop* lir =
        new(alloc()) LAtomicTypedArrayElementBinop(elements, index, value, tempDef1, tempDef2);

    if (fixedOutput)
        defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
    else if (reuseInput)
        defineReuseInput(lir, ins, LAtomicTypedArrayElementBinop::valueOp);
    else
        define(lir, ins);
}

void
LIRGeneratorX86Shared::visitSimdInsertElement(MSimdInsertElement* ins)
{
    MOZ_ASSERT(IsSimdType(ins->type()));

    LUse vec = useRegisterAtStart(ins->vector());
    LUse val = useRegister(ins->value());
    switch (ins->type()) {
      case MIRType::Int8x16:
      case MIRType::Bool8x16:
        // When SSE 4.1 is not available, we need to go via the stack.
        // This requires the value being inserted to be in one of %eax-%edx,
        // the registers with 8-bit subregisters. Pick %ebx, since other
        // instructions use %eax or %ecx hard-wired.
#if defined(JS_CODEGEN_X86)
        if (!AssemblerX86Shared::HasSSE41())
            val = useFixed(ins->value(), ebx);
#endif
        defineReuseInput(new(alloc()) LSimdInsertElementI(vec, val), ins, 0);
        break;
      case MIRType::Int16x8:
      case MIRType::Int32x4:
      case MIRType::Bool16x8:
      case MIRType::Bool32x4:
        defineReuseInput(new(alloc()) LSimdInsertElementI(vec, val), ins, 0);
        break;
      case MIRType::Float32x4:
        defineReuseInput(new(alloc()) LSimdInsertElementF(vec, val), ins, 0);
        break;
      default:
        MOZ_CRASH("Unknown SIMD kind when inserting element");
    }
}

void
LIRGeneratorX86Shared::visitSimdExtractElement(MSimdExtractElement* ins)
{
    MOZ_ASSERT(IsSimdType(ins->input()->type()));
    MOZ_ASSERT(!IsSimdType(ins->type()));

    switch (ins->input()->type()) {
      case MIRType::Int8x16:
      case MIRType::Int16x8:
      case MIRType::Int32x4: {
        MOZ_ASSERT(ins->signedness() != SimdSign::NotApplicable);
        LUse use = useRegisterAtStart(ins->input());
        if (ins->type() == MIRType::Double) {
            // Extract a Uint32 lane into a double.
            MOZ_ASSERT(ins->signedness() == SimdSign::Unsigned);
            define(new (alloc()) LSimdExtractElementU2D(use, temp()), ins);
        } else {
            auto* lir = new (alloc()) LSimdExtractElementI(use);
#if defined(JS_CODEGEN_X86)
            // On x86 (32-bit), we may need to use movsbl or movzbl instructions
            // to sign or zero extend the extracted lane to 32 bits. The 8-bit
            // versions of these instructions require a source register that is
            // %al, %bl, %cl, or %dl.
            // Fix it to %ebx since we can't express that constraint better.
            if (ins->input()->type() == MIRType::Int8x16) {
                defineFixed(lir, ins, LAllocation(AnyRegister(ebx)));
                return;
            }
#endif
            define(lir, ins);
        }
        break;
      }
      case MIRType::Float32x4: {
        MOZ_ASSERT(ins->signedness() == SimdSign::NotApplicable);
        LUse use = useRegisterAtStart(ins->input());
        define(new(alloc()) LSimdExtractElementF(use), ins);
        break;
      }
      case MIRType::Bool8x16:
      case MIRType::Bool16x8:
      case MIRType::Bool32x4: {
        MOZ_ASSERT(ins->signedness() == SimdSign::NotApplicable);
        LUse use = useRegisterAtStart(ins->input());
        define(new(alloc()) LSimdExtractElementB(use), ins);
        break;
      }
      default:
        MOZ_CRASH("Unknown SIMD kind when extracting element");
    }
}

void
LIRGeneratorX86Shared::visitSimdBinaryArith(MSimdBinaryArith* ins)
{
    MOZ_ASSERT(IsSimdType(ins->lhs()->type()));
    MOZ_ASSERT(IsSimdType(ins->rhs()->type()));
    MOZ_ASSERT(IsSimdType(ins->type()));

    MDefinition* lhs = ins->lhs();
    MDefinition* rhs = ins->rhs();

    if (ins->isCommutative())
        ReorderCommutative(&lhs, &rhs, ins);

    switch (ins->type()) {
      case MIRType::Int8x16: {
          LSimdBinaryArithIx16* lir = new (alloc()) LSimdBinaryArithIx16();
          lir->setTemp(0, LDefinition::BogusTemp());
          lowerForFPU(lir, ins, lhs, rhs);
          return;
      }

      case MIRType::Int16x8: {
          LSimdBinaryArithIx8* lir = new (alloc()) LSimdBinaryArithIx8();
          lir->setTemp(0, LDefinition::BogusTemp());
          lowerForFPU(lir, ins, lhs, rhs);
          return;
      }

      case MIRType::Int32x4: {
          LSimdBinaryArithIx4* lir = new (alloc()) LSimdBinaryArithIx4();
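          // Lane-wise 32-bit multiply (pmulld) requires SSE 4.1; the
          // fallback emulation needs a scratch vector.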
          bool needsTemp =
              ins->operation() == MSimdBinaryArith::Op_mul && !MacroAssembler::HasSSE41();
          lir->setTemp(0, needsTemp ? temp(LDefinition::SIMD128INT) : LDefinition::BogusTemp());
          lowerForFPU(lir, ins, lhs, rhs);
          return;
      }

      case MIRType::Float32x4: {
          LSimdBinaryArithFx4* lir = new (alloc()) LSimdBinaryArithFx4();

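          // max and the minNum/maxNum variants need extra NaN and -0
          // handling, which requires a scratch vector.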
          bool needsTemp = ins->operation() == MSimdBinaryArith::Op_max ||
              ins->operation() == MSimdBinaryArith::Op_minNum ||
              ins->operation() == MSimdBinaryArith::Op_maxNum;
          lir->setTemp(0,
                       needsTemp ? temp(LDefinition::SIMD128FLOAT) : LDefinition::BogusTemp());
          lowerForFPU(lir, ins, lhs, rhs);
          return;
      }

      default:
        MOZ_CRASH("unknown simd type on binary arith operation");
    }
}

void
LIRGeneratorX86Shared::visitSimdBinarySaturating(MSimdBinarySaturating* ins)
{
    MOZ_ASSERT(IsSimdType(ins->lhs()->type()));
    MOZ_ASSERT(IsSimdType(ins->rhs()->type()));
    MOZ_ASSERT(IsSimdType(ins->type()));

    MDefinition* lhs = ins->lhs();
    MDefinition* rhs = ins->rhs();

    if (ins->isCommutative())
        ReorderCommutative(&lhs, &rhs, ins);

    LSimdBinarySaturating* lir = new (alloc()) LSimdBinarySaturating();
    lowerForFPU(lir, ins, lhs, rhs);
}

void
LIRGeneratorX86Shared::visitSimdSelect(MSimdSelect* ins)
{
    MOZ_ASSERT(IsSimdType(ins->type()));

    LSimdSelect* lins = new(alloc()) LSimdSelect;
    MDefinition* r0 = ins->getOperand(0);
    MDefinition* r1 = ins->getOperand(1);
    MDefinition* r2 = ins->getOperand(2);

    lins->setOperand(0, useRegister(r0));
    lins->setOperand(1, useRegister(r1));
    lins->setOperand(2, useRegister(r2));
    lins->setTemp(0, temp(LDefinition::SIMD128FLOAT));

    define(lins, ins);
}

void
LIRGeneratorX86Shared::visitSimdSplat(MSimdSplat* ins)
{
    LAllocation x = useRegisterAtStart(ins->getOperand(0));

    switch (ins->type()) {
      case MIRType::Int8x16:
        define(new (alloc()) LSimdSplatX16(x), ins);
        break;
      case MIRType::Int16x8:
        define(new (alloc()) LSimdSplatX8(x), ins);
        break;
      case MIRType::Int32x4:
      case MIRType::Float32x4:
      case MIRType::Bool8x16:
      case MIRType::Bool16x8:
      case MIRType::Bool32x4:
        // Use the SplatX4 instruction for all boolean splats. Since the input
        // value is a 32-bit int that is either 0 or -1, the X4 splat gives
        // the right result for all boolean geometries.
        // For floats, non-AVX codegen actually wants the input and the output
        // to be in the same register, but we can't currently use
        // defineReuseInput because they have different types (scalar vs
        // vector), so a spill slot for one may not be suitable for the other.
        define(new (alloc()) LSimdSplatX4(x), ins);
        break;
      default:
        MOZ_CRASH("Unknown SIMD kind");
    }
}

void
LIRGeneratorX86Shared::visitSimdValueX4(MSimdValueX4* ins)
{
    switch (ins->type()) {
      case MIRType::Float32x4: {
        // Ideally, x would be used at start and reused for the output, however
        // register allocation currently doesn't permit us to tie together two
        // virtual registers with different types.
        LAllocation x = useRegister(ins->getOperand(0));
        LAllocation y = useRegister(ins->getOperand(1));
        LAllocation z = useRegister(ins->getOperand(2));
        LAllocation w = useRegister(ins->getOperand(3));
        LDefinition t = temp(LDefinition::SIMD128FLOAT);
        define(new (alloc()) LSimdValueFloat32x4(x, y, z, w, t), ins);
        break;
      }
      case MIRType::Bool32x4:
      case MIRType::Int32x4: {
        // No defineReuseInput => useAtStart for everyone.
        LAllocation x = useRegisterAtStart(ins->getOperand(0));
        LAllocation y = useRegisterAtStart(ins->getOperand(1));
        LAllocation z = useRegisterAtStart(ins->getOperand(2));
        LAllocation w = useRegisterAtStart(ins->getOperand(3));
        define(new(alloc()) LSimdValueInt32x4(x, y, z, w), ins);
        break;
      }
      default:
        MOZ_CRASH("Unknown SIMD kind");
    }
}

void
LIRGeneratorX86Shared::visitSimdSwizzle(MSimdSwizzle* ins)
{
    MOZ_ASSERT(IsSimdType(ins->input()->type()));
    MOZ_ASSERT(IsSimdType(ins->type()));

    if (IsIntegerSimdType(ins->input()->type())) {
        LUse use = useRegisterAtStart(ins->input());
        LSimdSwizzleI* lir = new (alloc()) LSimdSwizzleI(use);
        define(lir, ins);
        // We need a GPR temp register for pre-SSSE3 codegen (no vpshufb).
        if (Assembler::HasSSSE3()) {
            lir->setTemp(0, LDefinition::BogusTemp());
        } else {
            // The temp must be a GPR usable with 8-bit loads and stores.
#if defined(JS_CODEGEN_X86)
            lir->setTemp(0, tempFixed(ebx));
#else
            lir->setTemp(0, temp());
#endif
        }
    } else if (ins->input()->type() == MIRType::Float32x4) {
        LUse use = useRegisterAtStart(ins->input());
        LSimdSwizzleF* lir = new (alloc()) LSimdSwizzleF(use);
        define(lir, ins);
        lir->setTemp(0, LDefinition::BogusTemp());
    } else {
        MOZ_CRASH("Unknown SIMD kind when getting lane");
    }
}

void
LIRGeneratorX86Shared::visitSimdShuffle(MSimdShuffle* ins)
{
    MOZ_ASSERT(IsSimdType(ins->lhs()->type()));
    MOZ_ASSERT(IsSimdType(ins->rhs()->type()));
    MOZ_ASSERT(IsSimdType(ins->type()));
    if (ins->type() == MIRType::Int32x4 || ins->type() == MIRType::Float32x4) {
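        // Shuffle lane indices 0-3 select from lhs and 4-7 from rhs; count
        // how many output lanes read lhs to pick the instruction sequence.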
        bool zFromLHS = ins->lane(2) < 4;
        bool wFromLHS = ins->lane(3) < 4;
        uint32_t lanesFromLHS = (ins->lane(0) < 4) + (ins->lane(1) < 4) + zFromLHS + wFromLHS;

        LSimdShuffleX4* lir = new (alloc()) LSimdShuffleX4();
        lowerForFPU(lir, ins, ins->lhs(), ins->rhs());

        // See the codegen for the details of these requirements.
        LDefinition temp =
          (lanesFromLHS == 3) ? tempCopy(ins->rhs(), 1) : LDefinition::BogusTemp();
        lir->setTemp(0, temp);
    } else {
        MOZ_ASSERT(ins->type() == MIRType::Int8x16 || ins->type() == MIRType::Int16x8);
        LSimdShuffle* lir = new (alloc()) LSimdShuffle();
        lir->setOperand(0, useRegister(ins->lhs()));
        lir->setOperand(1, useRegister(ins->rhs()));
        define(lir, ins);
        // We need a GPR temp register for pre-SSSE3 codegen, and an SSE temp
        // when using pshufb.
        if (Assembler::HasSSSE3()) {
            lir->setTemp(0, temp(LDefinition::SIMD128INT));
        } else {
            // The temp must be a GPR usable with 8-bit loads and stores.
#if defined(JS_CODEGEN_X86)
            lir->setTemp(0, tempFixed(ebx));
#else
            lir->setTemp(0, temp());
#endif
        }
    }
}

void
LIRGeneratorX86Shared::visitSimdGeneralShuffle(MSimdGeneralShuffle* ins)
{
    MOZ_ASSERT(IsSimdType(ins->type()));

    LSimdGeneralShuffleBase* lir;
    if (IsIntegerSimdType(ins->type())) {
#if defined(JS_CODEGEN_X86)
        // The temp register must be usable with 8-bit load and store
        // instructions, so one of %eax-%edx.
        LDefinition t;
        if (ins->type() == MIRType::Int8x16)
            t = tempFixed(ebx);
        else
            t = temp();
#else
        LDefinition t = temp();
#endif
        lir = new (alloc()) LSimdGeneralShuffleI(t);
    } else if (ins->type() == MIRType::Float32x4) {
        lir = new (alloc()) LSimdGeneralShuffleF(temp());
    } else {
        MOZ_CRASH("Unknown SIMD kind when doing a shuffle");
    }

    if (!lir->init(alloc(), ins->numVectors() + ins->numLanes()))
        return;

    for (unsigned i = 0; i < ins->numVectors(); i++) {
        MOZ_ASSERT(IsSimdType(ins->vector(i)->type()));
        lir->setOperand(i, useRegister(ins->vector(i)));
    }

    for (unsigned i = 0; i < ins->numLanes(); i++) {
        MOZ_ASSERT(ins->lane(i)->type() == MIRType::Int32);
        // Note that there can be up to 16 lane arguments, so we can't assume
        // that they all get an allocated register.
        lir->setOperand(i + ins->numVectors(), use(ins->lane(i)));
    }

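    // Dynamic lane indices are bounds-checked at run time and bail out when
    // out of range.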
    assignSnapshot(lir, Bailout_BoundsCheck);
    define(lir, ins);
}

void
LIRGeneratorX86Shared::visitCopySign(MCopySign* ins)
{
    MDefinition* lhs = ins->lhs();
    MDefinition* rhs = ins->rhs();

    MOZ_ASSERT(IsFloatingPointType(lhs->type()));
    MOZ_ASSERT(lhs->type() == rhs->type());
    MOZ_ASSERT(lhs->type() == ins->type());

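    // The temps are scratch registers for the sign-mask bit manipulation.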
    LInstructionHelper<1, 2, 2>* lir;
    if (lhs->type() == MIRType::Double)
        lir = new(alloc()) LCopySignD();
    else
        lir = new(alloc()) LCopySignF();

    // As in lowerForFPU, but we want rhs to be in an FP register too.
    lir->setOperand(0, useRegisterAtStart(lhs));
    lir->setOperand(1, lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs));
    if (!Assembler::HasAVX())
        defineReuseInput(lir, ins, 0);
    else
        define(lir, ins);
}