/* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// Platform specific code to invoke XPCOM methods on native objects

#include "xptcprivate.h"

// Number of general-purpose registers that can carry integral (and
// pointer) parameters before further ones spill to the stack.
static const uint32_t GPR_COUNT = 6;

// Number of SSE registers that can carry floating point parameters
// before further ones spill to the stack.
static const uint32_t FPR_COUNT = 8;

// Remember that these 'words' are 64-bit long
static inline void
invoke_count_words(uint32_t paramCount, nsXPTCVariant * s,
                   uint32_t & nr_stack)
{
    uint32_t nr_gpr;
    uint32_t nr_fpr;
    nr_gpr = 1; // skip one GP register for 'that'
    nr_fpr = 0;
    nr_stack = 0;

    /* Compute number of eightbytes of class MEMORY.  */
    for (uint32_t i = 0; i < paramCount; i++, s++) {
        if (!s->IsPtrData()
            && (s->type == nsXPTType::T_FLOAT || s->type == nsXPTType::T_DOUBLE)) {
            if (nr_fpr < FPR_COUNT)
                nr_fpr++;
            else
                nr_stack++;
        }
        else {
            if (nr_gpr < GPR_COUNT)
                nr_gpr++;
            else
                nr_stack++;
        }
    }
}

static void
invoke_copy_to_stack(uint64_t * d, uint32_t paramCount, nsXPTCVariant * s,
                     uint64_t * gpregs, double * fpregs)
{
    uint32_t nr_gpr = 1u; // skip one GP register for 'that'
    uint32_t nr_fpr = 0u;
    uint64_t value = 0u;

    for (uint32_t i = 0; i < paramCount; i++, s++) {
        if (s->IsPtrData())
            value = (uint64_t) s->ptr;
        else {
            switch (s->type) {
            case nsXPTType::T_FLOAT:                                break;
            case nsXPTType::T_DOUBLE:                               break;
            case nsXPTType::T_I8:     value = s->val.i8;            break;
            case nsXPTType::T_I16:    value = s->val.i16;           break;
            case nsXPTType::T_I32:    value = s->val.i32;           break;
            case nsXPTType::T_I64:    value = s->val.i64;           break;
            case nsXPTType::T_U8:     value = s->val.u8;            break;
            case nsXPTType::T_U16:    value = s->val.u16;           break;
            case nsXPTType::T_U32:    value = s->val.u32;           break;
            case nsXPTType::T_U64:    value = s->val.u64;           break;
            case nsXPTType::T_BOOL:   value = s->val.b;             break;
            case nsXPTType::T_CHAR:   value = s->val.c;             break;
            case nsXPTType::T_WCHAR:  value = s->val.wc;            break;
            default:                  value = (uint64_t) s->val.p;  break;
            }
        }

        if (!s->IsPtrData() && s->type == nsXPTType::T_DOUBLE) {
            if (nr_fpr < FPR_COUNT)
                fpregs[nr_fpr++] = s->val.d;
            else {
                *((double *)d) = s->val.d;
                d++;
            }
        }
        else if (!s->IsPtrData() && s->type == nsXPTType::T_FLOAT) {
            if (nr_fpr < FPR_COUNT)
                // The value in %xmm register is already prepared to
                // be retrieved as a float. Therefore, we pass the
                // value verbatim, as a double without conversion.
                fpregs[nr_fpr++] = s->val.d;
            else {
                *((float *)d) = s->val.f;
                d++;
            }
        }
        else {
            if (nr_gpr < GPR_COUNT)
                gpregs[nr_gpr++] = value;
            else
                *d++ = value;
        }
    }
}

// Disable avx for the next function to allow compilation with
// -march=native on new machines, or similar hardcoded -march options.
// Having avx enabled appears to change the alignment behavior of alloca
// (apparently adding an extra 16 bytes of padding/alignment, and using
// 32-byte alignment instead of 16-byte).  This seems to be the best
// available workaround, given that this code, which should perhaps
// better be written in assembly, is written in C++.
// The pragmas are only emitted when the compiler is not clang.
#ifndef __clang__
#pragma GCC push_options
#pragma GCC target ("no-avx")
#endif

// Avoid AddressSanitizer instrumentation for the next function because it
// depends on __builtin_alloca behavior and alignment that cannot be relied on
// once the function is compiled with a version of ASan that has dynamic-alloca
// instrumentation enabled.

// Invokes the method with index 'methodIndex' on the object 'that'.
//
//   that        - object to call; it is passed as the first integer
//                 argument, and its first 64-bit word is read as the base
//                 address of a table of method addresses (presumably the
//                 vtable pointer -- confirm against the object layout)
//   methodIndex - index of the 8-byte entry in that table to call
//   paramCount  - number of entries in 'params'
//   params      - parameter descriptors, distributed over the integer
//                 registers, the SSE registers and the stack by
//                 invoke_copy_to_stack()
//
// Returns the nsresult produced by the invoked method.
MOZ_ASAN_BLACKLIST
EXPORT_XPCOM_API(nsresult)
NS_InvokeByIndex(nsISupports * that, uint32_t methodIndex,
                 uint32_t paramCount, nsXPTCVariant * params)
{
    // Number of 64-bit words that do not fit in registers
    uint32_t nr_stack;
    invoke_count_words(paramCount, params, nr_stack);
    
    // Stack, if used, must be 16-bytes aligned: round the word count up
    // to an even number of 8-byte words
    if (nr_stack)
        nr_stack = (nr_stack + 1) & ~1;

    // Load parameters to stack, if necessary
    uint64_t *stack = (uint64_t *) __builtin_alloca(nr_stack * 8);
    uint64_t gpregs[GPR_COUNT];
    double fpregs[FPR_COUNT];
    invoke_copy_to_stack(stack, paramCount, params, gpregs, fpregs);

    // We used to have switches to make sure we would only load the registers
    // that are needed for this call. That produced larger code that was
    // not faster in practice. It also caused compiler warnings about the
    // variables being used uninitialized.
    // We now just load every register. There could still be a warning
    // from memory analysis tools that we are loading uninitialized stack
    // positions.

    // FIXME: this function depends on the above __builtin_alloca placing
    // the array in the correct spot for the ABI.

    // Load FPR registers from fpregs[]
    double d0, d1, d2, d3, d4, d5, d6, d7;

    d7 = fpregs[7];
    d6 = fpregs[6];
    d5 = fpregs[5];
    d4 = fpregs[4];
    d3 = fpregs[3];
    d2 = fpregs[2];
    d1 = fpregs[1];
    d0 = fpregs[0];

    // Load GPR registers from gpregs[]
    uint64_t a0, a1, a2, a3, a4, a5;

    a5 = gpregs[5];
    a4 = gpregs[4];
    a3 = gpregs[3];
    a2 = gpregs[2];
    a1 = gpregs[1];
    // gpregs[0] is never read: the first integer argument is 'that'
    a0 = (uint64_t) that;

    // Get pointer to method: read the table address stored in the first
    // word of the object, step to the 8-byte entry for methodIndex and
    // load the method address from it
    uint64_t methodAddress = *((uint64_t *)that);
    methodAddress += 8 * methodIndex;
    methodAddress = *((uint64_t *)methodAddress);
    
    // Call through a signature that takes all six integer and all eight
    // floating point values, so that every one of them is passed along
    typedef nsresult (*Method)(uint64_t, uint64_t, uint64_t, uint64_t,
                               uint64_t, uint64_t, double, double, double,
                               double, double, double, double, double);
    nsresult result = ((Method)methodAddress)(a0, a1, a2, a3, a4, a5,
                                              d0, d1, d2, d3, d4, d5,
                                              d6, d7);
    return result;
}

// Restore the compiler options saved before the function
#ifndef __clang__
#pragma GCC pop_options
#endif