qemu/target/arm/vec_internal.h

/*
 * ARM AdvSIMD / SVE Vector Helpers
 *
 * Copyright (c) 2020 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#ifndef TARGET_ARM_VEC_INTERNALS_H
#define TARGET_ARM_VEC_INTERNALS_H

/*
 * Note that vector data is stored in host-endian 64-bit chunks,
 * so addressing units smaller than that need a host-endian fixup.
 *
 * The H<N> macros are used when indexing an array of elements of size N
 * (the argument is an element index).
 *
 * The H1_<N> macros are used when performing byte arithmetic and then
 * casting the final pointer to a type of size N (the argument is a byte
 * offset).
 *
 * On a big-endian host each macro XORs its argument with
 * (8 / unit - 1) * step, where "unit" is the size of the thing being
 * addressed and "step" is the size of one index increment: 7 for bytes,
 * 3 for 2-byte elements, 1 for 4-byte elements, and 6 / 4 for byte
 * offsets of 2-byte / 4-byte objects.  This reverses the position of the
 * unit within its 64-bit chunk while leaving the chunk number alone.
 * On a little-endian host no adjustment is needed and every macro is the
 * identity.
 */
#if HOST_BIG_ENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#else
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#endif
/*
 * Access to 64-bit elements isn't host-endian dependent; we provide H8
 * and H1_8 so that when a function is being generated from a macro we
 * can pass these rather than an empty macro argument, for clarity.
 * Both are the identity on every host.
 */
#define H8(x)   (x)
#define H1_8(x) (x)

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 * Only declared here; the table itself is defined in a .c file.
 * NOTE(review): with 256 uint64_t entries this is presumably indexed by
 * 8 predicate bits, yielding one mask byte per bit -- confirm against
 * the definition.
 */
extern const uint64_t expand_pred_b_data[256];

/*
 * Zero the part of the vector at @vd that lies between byte offsets
 * @opr_sz and @max_sz.
 *
 * Zeroes are stored as whole 64-bit words, starting at byte offset
 * @opr_sz and advancing 8 bytes at a time for as long as the running
 * offset is below @max_sz.  Nothing is written when @opr_sz >= @max_sz.
 */
static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *tail = (void *)((char *)vd + opr_sz);
    uintptr_t off = opr_sz;

    while (off < max_sz) {
        *tail = 0;
        tail++;
        off += 8;
    }
}

/*
 * Signed shift of an element of up to 32 bits, with optional rounding
 * and optional saturation.
 *
 * @src:   element value (presumably already sign-extended from @bits
 *         bits into the int32_t -- the overflow test below relies on it)
 * @shift: shift count; >= 0 shifts left, < 0 shifts right by -@shift
 * @bits:  element width in bits (the 32-bit case is handled specially)
 * @round: for right shifts, round to nearest by adding back the last
 *         bit shifted out
 * @sat:   if non-NULL, a left shift that loses significant bits returns
 *         the saturated value and stores 1 to *@sat; if NULL, the result
 *         is simply truncated to @bits bits
 *
 * Returns the shifted (and possibly rounded or saturated) value.
 */
static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits,
                                    bool round, uint32_t *sat)
{
    if (shift <= -bits) {
        /* Rounding the sign bit always produces 0. */
        if (round) {
            return 0;
        }
        /* Everything but the sign is shifted out: result is 0 or -1. */
        return src >> 31;
    } else if (shift < 0) {
        if (round) {
            /*
             * Shift by one less than requested, then fold the final
             * bit back in; this cannot overflow an intermediate.
             */
            src >>= -shift - 1;
            return (src >> 1) + (src & 1);
        }
        return src >> -shift;
    } else if (shift < bits) {
        /*
         * Shift in unsigned arithmetic: left-shifting a negative signed
         * value, or one whose result is not representable, is undefined
         * behaviour in ISO C, and the latter is precisely the case the
         * saturation check below needs to detect.
         */
        int32_t val = (int32_t)((uint32_t)src << shift);
        if (bits == 32) {
            /* No overflow iff shifting back recovers the input. */
            if (!sat || val >> shift == src) {
                return val;
            }
        } else {
            /* No overflow iff the result fits in @bits signed bits. */
            int32_t extval = sextract32(val, 0, bits);
            if (!sat || val == extval) {
                return extval;
            }
        }
    } else if (!sat || src == 0) {
        /* Left shift by >= @bits: every bit of the element is lost. */
        return 0;
    }

    /*
     * Saturate: 2^(bits-1) - 1 for non-negative inputs, 2^(bits-1)
     * (the minimum-negative bit pattern in @bits bits) for negative ones.
     */
    *sat = 1;
    return (1u << (bits - 1)) - (src >= 0);
}

/*
 * Unsigned shift of an element of up to 32 bits, with optional rounding
 * and optional saturation.
 *
 * @src:   element value
 * @shift: shift count; >= 0 shifts left, < 0 shifts right by -@shift
 * @bits:  element width in bits (the 32-bit case is handled specially)
 * @round: for right shifts, round to nearest by adding back the last
 *         bit shifted out
 * @sat:   if non-NULL, a left shift that loses set bits returns the
 *         all-ones value for @bits bits and stores 1 to *@sat; if NULL,
 *         the result is simply truncated to @bits bits
 *
 * Returns the shifted (and possibly rounded or saturated) value.
 */
static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits,
                                     bool round, uint32_t *sat)
{
    /*
     * Right shift so large that nothing survives.  When rounding, a
     * shift by exactly @bits can still round up to 1, so the cut-off
     * moves out by one.
     */
    if (shift <= -(bits + round)) {
        return 0;
    }

    if (shift < 0) {
        if (!round) {
            return src >> -shift;
        }
        /* Stop one bit short, then fold the last bit back in. */
        src >>= -shift - 1;
        return (src >> 1) + (src & 1);
    }

    if (shift >= bits) {
        /* Every bit of the element is shifted out. */
        if (!sat || src == 0) {
            return 0;
        }
    } else {
        uint32_t shifted = src << shift;

        if (bits == 32) {
            /* No overflow iff shifting back recovers the input. */
            if (!sat || shifted >> shift == src) {
                return shifted;
            }
        } else {
            /* No overflow iff nothing landed above bit @bits - 1. */
            uint32_t kept = extract32(shifted, 0, bits);

            if (!sat || shifted == kept) {
                return kept;
            }
        }
    }

    /* Overflow with saturation requested. */
    *sat = 1;
    return MAKE_64BIT_MASK(0, bits);
}

/*
 * Shift a signed input, producing an unsigned-saturated result.
 *
 * When saturation is enabled (@sat non-NULL) a negative @src clamps to 0
 * and stores 1 to *@sat.  In every other case the work is delegated to
 * do_uqrshl_bhs() with the same @shift, @bits, @round and @sat.
 */
static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits,
                                     bool round, uint32_t *sat)
{
    if (!sat || src >= 0) {
        return do_uqrshl_bhs(src, shift, bits, round, sat);
    }

    *sat = 1;
    return 0;
}

/*
 * Signed shift of a 64-bit element, with optional rounding and optional
 * saturation.
 *
 * @src:   element value
 * @shift: shift count; >= 0 shifts left, < 0 shifts right by -@shift
 * @round: for right shifts, round to nearest by adding back the last
 *         bit shifted out
 * @sat:   if non-NULL, a left shift that loses significant bits returns
 *         INT64_MIN or INT64_MAX (by the sign of @src) and stores 1 to
 *         *@sat; if NULL, the result is simply truncated to 64 bits
 *
 * Returns the shifted (and possibly rounded or saturated) value.
 */
static inline int64_t do_sqrshl_d(int64_t src, int64_t shift,
                                  bool round, uint32_t *sat)
{
    if (shift <= -64) {
        /* Rounding the sign bit always produces 0. */
        if (round) {
            return 0;
        }
        /* Everything but the sign is shifted out: result is 0 or -1. */
        return src >> 63;
    } else if (shift < 0) {
        if (round) {
            /*
             * Shift by one less than requested, then fold the final
             * bit back in; this cannot overflow an intermediate.
             */
            src >>= -shift - 1;
            return (src >> 1) + (src & 1);
        }
        return src >> -shift;
    } else if (shift < 64) {
        /*
         * Shift in unsigned arithmetic: left-shifting a negative signed
         * value, or one whose result is not representable, is undefined
         * behaviour in ISO C, and the latter is precisely the case the
         * saturation check below needs to detect.
         */
        int64_t val = (int64_t)((uint64_t)src << shift);
        /* No overflow iff shifting back recovers the input. */
        if (!sat || val >> shift == src) {
            return val;
        }
    } else if (!sat || src == 0) {
        /* Left shift by >= 64: every bit is lost. */
        return 0;
    }

    *sat = 1;
    return src < 0 ? INT64_MIN : INT64_MAX;
}

/*
 * Unsigned shift of a 64-bit element, with optional rounding and
 * optional saturation.
 *
 * @src:   element value
 * @shift: shift count; >= 0 shifts left, < 0 shifts right by -@shift
 * @round: for right shifts, round to nearest by adding back the last
 *         bit shifted out
 * @sat:   if non-NULL, a left shift that loses set bits returns
 *         UINT64_MAX and stores 1 to *@sat; if NULL, the result is
 *         simply truncated to 64 bits
 *
 * Returns the shifted (and possibly rounded or saturated) value.
 */
static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift,
                                   bool round, uint32_t *sat)
{
    /*
     * Right shift so large that nothing survives.  When rounding, a
     * shift by exactly 64 can still round up to 1, so the cut-off moves
     * out by one.
     */
    if (shift <= -(64 + round)) {
        return 0;
    }

    if (shift < 0) {
        if (!round) {
            return src >> -shift;
        }
        /* Stop one bit short, then fold the last bit back in. */
        src >>= -shift - 1;
        return (src >> 1) + (src & 1);
    }

    if (shift >= 64) {
        /* Every bit is shifted out. */
        if (!sat || src == 0) {
            return 0;
        }
    } else {
        uint64_t shifted = src << shift;

        /* No overflow iff shifting back recovers the input. */
        if (!sat || shifted >> shift == src) {
            return shifted;
        }
    }

    /* Overflow with saturation requested. */
    *sat = 1;
    return UINT64_MAX;
}

/*
 * Shift a signed 64-bit input, producing an unsigned-saturated result.
 *
 * When saturation is enabled (@sat non-NULL) a negative @src clamps to 0
 * and stores 1 to *@sat.  In every other case the work is delegated to
 * do_uqrshl_d() with the same @shift, @round and @sat.
 */
static inline int64_t do_suqrshl_d(int64_t src, int64_t shift,
                                   bool round, uint32_t *sat)
{
    if (!sat || src >= 0) {
        return do_uqrshl_d(src, shift, round, sat);
    }

    *sat = 1;
    return 0;
}

/*
 * Out-of-line helpers, one per element size; only the prototypes live
 * in this header.  Each takes three element-sized operands and two
 * boolean flags; the 16- and 32-bit variants additionally take a
 * uint32_t pointer.
 * NOTE(review): going by the names these are the signed saturating
 * rounding doubling multiply-accumulate primitives, with the pointer
 * presumably a saturation flag as in the shift helpers above -- the
 * parameter meanings are not visible here, confirm against the
 * definitions.
 */
int8_t do_sqrdmlah_b(int8_t, int8_t, int8_t, bool, bool);
int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);
int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);
int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);

/*
 * 8 x 8 -> 16 vector polynomial multiply where the inputs are
 * in the low 8 bits of each 16-bit element.  Operates on all the
 * 16-bit elements packed into the 64-bit operands at once.
 */
uint64_t pmull_h(uint64_t op1, uint64_t op2);
/*
 * 16 x 16 -> 32 vector polynomial multiply where the inputs are
 * in the low 16 bits of each 32-bit element.  Operates on all the
 * 32-bit elements packed into the 64-bit operands at once.
 */
uint64_t pmull_w(uint64_t op1, uint64_t op2);

#endif /* TARGET_ARM_VEC_INTERNALS_H */
target/arm: Convert aes and sm4 to gvec helpers With this conversion, we will be able to use the same helpers with sve. In particular, pass 3 vector parameters for the 3-operand operations; for advsimd the destination register is also an input. This also fixes a bug in which we failed to clear the high bits of the SVE register after an AdvSIMD operation. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Message-id: 20200514212831.31248-2-richard.henderson@linaro.org Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org> 2020-05-14 14:28:26 -07:00			`/*`
			`* ARM AdvSIMD / SVE Vector Helpers`
			`*`
			`* Copyright (c) 2020 Linaro`
			`*`
			`* This library is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
arm tcg cpus: Fix Lesser GPL version number There is no "version 2" of the "Lesser" General Public License. It is either "GPL version 2.0" or "Lesser GPL version 2.1". This patch replaces all occurrences of "Lesser GPL version 2" with "Lesser GPL version 2.1" in comment section. Signed-off-by: Chetan Pant <chetan4windows@gmail.com> Message-Id: <20201023122913.19561-1-chetan4windows@gmail.com> Reviewed-by: Thomas Huth <thuth@redhat.com> Signed-off-by: Thomas Huth <thuth@redhat.com> 2020-10-23 12:29:13 +00:00			`* version 2.1 of the License, or (at your option) any later version.`
target/arm: Convert aes and sm4 to gvec helpers With this conversion, we will be able to use the same helpers with sve. In particular, pass 3 vector parameters for the 3-operand operations; for advsimd the destination register is also an input. This also fixes a bug in which we failed to clear the high bits of the SVE register after an AdvSIMD operation. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Message-id: 20200514212831.31248-2-richard.henderson@linaro.org Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org> 2020-05-14 14:28:26 -07:00			`*`
			`* This library is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with this library; if not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`#ifndef TARGET_ARM_VEC_INTERNALS_H`
			`#define TARGET_ARM_VEC_INTERNALS_H`

target/arm: Move endian adjustment macros to vec_internal.h We have two copies of these, one set of which is not complete. Move them to a common header. Suggested-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Message-id: 20210525010358.152808-82-richard.henderson@linaro.org Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org> 2021-05-24 18:03:47 -07:00			`/*`
			`* Note that vector data is stored in host-endian 64-bit chunks,`
			`* so addressing units smaller than that needs a host-endian fixup.`
			`*`
			`* The H<N> macros are used when indexing an array of elements of size N.`
			`*`
			`* The H1_<N> macros are used when performing byte arithmetic and then`
			`* casting the final pointer to a type of size N.`
			`*/`
Replace config-time define HOST_WORDS_BIGENDIAN Replace a config-time define with a compile time condition define (compatible with clang and gcc) that must be declared prior to its usage. This avoids having a global configure time define, but also prevents from bad usage, if the config header wasn't included before. This can help to make some code independent from qemu too. gcc supports __BYTE_ORDER__ from about 4.6 and clang from 3.2. Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com> [ For the s390x parts I'm involved in ] Acked-by: Halil Pasic <pasic@linux.ibm.com> Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Message-Id: <20220323155743.1585078-7-marcandre.lureau@redhat.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2022-03-23 19:57:17 +04:00			`#if HOST_BIG_ENDIAN`
target/arm: Move endian adjustment macros to vec_internal.h We have two copies of these, one set of which is not complete. Move them to a common header. Suggested-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Message-id: 20210525010358.152808-82-richard.henderson@linaro.org Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org> 2021-05-24 18:03:47 -07:00			`#define H1(x) ((x) ^ 7)`
			`#define H1_2(x) ((x) ^ 6)`
			`#define H1_4(x) ((x) ^ 4)`
			`#define H2(x) ((x) ^ 3)`
			`#define H4(x) ((x) ^ 1)`
			`#else`
			`#define H1(x) (x)`
			`#define H1_2(x) (x)`
			`#define H1_4(x) (x)`
			`#define H2(x) (x)`
			`#define H4(x) (x)`
			`#endif`
target/arm: Provide and use H8 and H1_8 macros Currently we provide Hn and H1_n macros for accessing the correct data within arrays of vector elements of size 1, 2 and 4, accounting for host endianness. We don't provide any macros for elements of size 8 because there the host endianness doesn't matter. However, this does result in awkwardness where we need to pass empty arguments to macros, because checkpatch complains about them. The empty argument is a little confusing for humans to read as well. Add H8() and H1_8() macros and use them where we were previously passing empty arguments to macros. Suggested-by: Richard Henderson <richard.henderson@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Message-id: 20210614151007.4545-2-peter.maydell@linaro.org Message-id: 20210610132505.5827-1-peter.maydell@linaro.org 2021-06-14 16:09:11 +01:00			`/*`
			`* Access to 64-bit elements isn't host-endian dependent; we provide H8`
			`* and H1_8 so that when a function is being generated from a macro we`
			`* can pass these rather than an empty macro argument, for clarity.`
			`*/`
			`#define H8(x) (x)`
			`#define H1_8(x) (x)`
target/arm: Move endian adjustment macros to vec_internal.h We have two copies of these, one set of which is not complete. Move them to a common header. Suggested-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Message-id: 20210525010358.152808-82-richard.henderson@linaro.org Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org> 2021-05-24 18:03:47 -07:00
target/arm: Move expand_pred_b() data to vec_helper.c For MVE, we want to re-use the large data table from expand_pred_b(). Move the data table to vec_helper.c so it is no longer in an SVE specific source file. Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Message-id: 20210614151007.4545-14-peter.maydell@linaro.org 2021-06-14 16:09:23 +01:00			`/* Data for expanding active predicate bits to bytes, for byte elements. */`
			`extern const uint64_t expand_pred_b_data[256];`

target/arm: Convert aes and sm4 to gvec helpers With this conversion, we will be able to use the same helpers with sve. In particular, pass 3 vector parameters for the 3-operand operations; for advsimd the destination register is also an input. This also fixes a bug in which we failed to clear the high bits of the SVE register after an AdvSIMD operation. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Message-id: 20200514212831.31248-2-richard.henderson@linaro.org Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org> 2020-05-14 14:28:26 -07:00			`static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)`
			`{`
			`uint64_t *d = vd + opr_sz;`
			`uintptr_t i;`

			`for (i = opr_sz; i < max_sz; i += 8) {`
			`*d++ = 0;`
			`}`
			`}`

target/arm: Split out saturating/rounding shifts from neon Split these operations out into a header that can be shared between neon and sve. The "sat" pointer acts both as a boolean for control of saturating behavior and controls the difference in behavior between neon and sve -- QC bit or no QC bit. Widen the shift operand in the new helpers, as the SVE2 insns treat the whole input element as significant. For the neon uses, truncate the shift to int8_t while passing the parameter. Implement right-shift rounding as tmp = src >> (shift - 1); dst = (tmp >> 1) + (tmp & 1); This is the same number of instructions as the current tmp = 1 << (shift - 1); dst = (src + tmp) >> shift; without any possibility of intermediate overflow. Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Message-id: 20210525010358.152808-6-richard.henderson@linaro.org Signed-off-by: Peter Maydell <peter.maydell@linaro.org> 2021-05-24 18:02:31 -07:00			`static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits,`
			`bool round, uint32_t *sat)`
			`{`
			`if (shift <= -bits) {`
			`/* Rounding the sign bit always produces 0. */`
			`if (round) {`
			`return 0;`
			`}`
			`return src >> 31;`
			`} else if (shift < 0) {`
			`if (round) {`
			`src >>= -shift - 1;`
			`return (src >> 1) + (src & 1);`
			`}`
			`return src >> -shift;`
			`} else if (shift < bits) {`
			`int32_t val = src << shift;`
			`if (bits == 32) {`
			`if (!sat \|\| val >> shift == src) {`
			`return val;`
			`}`
			`} else {`
			`int32_t extval = sextract32(val, 0, bits);`
			`if (!sat \|\| val == extval) {`
			`return extval;`
			`}`
			`}`
			`} else if (!sat \|\| src == 0) {`
			`return 0;`
			`}`

			`*sat = 1;`
			`return (1u << (bits - 1)) - (src >= 0);`
			`}`

			`static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits,`
			`bool round, uint32_t *sat)`
			`{`
			`if (shift <= -(bits + round)) {`
			`return 0;`
			`} else if (shift < 0) {`
			`if (round) {`
			`src >>= -shift - 1;`
			`return (src >> 1) + (src & 1);`
			`}`
			`return src >> -shift;`
			`} else if (shift < bits) {`
			`uint32_t val = src << shift;`
			`if (bits == 32) {`
			`if (!sat \|\| val >> shift == src) {`
			`return val;`
			`}`
			`} else {`
			`uint32_t extval = extract32(val, 0, bits);`
			`if (!sat \|\| val == extval) {`
			`return extval;`
			`}`
			`}`
			`} else if (!sat \|\| src == 0) {`
			`return 0;`
			`}`

			`*sat = 1;`
			`return MAKE_64BIT_MASK(0, bits);`
			`}`

			`static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits,`
			`bool round, uint32_t *sat)`
			`{`
			`if (sat && src < 0) {`
			`*sat = 1;`
			`return 0;`
			`}`
			`return do_uqrshl_bhs(src, shift, bits, round, sat);`
			`}`

			`static inline int64_t do_sqrshl_d(int64_t src, int64_t shift,`
			`bool round, uint32_t *sat)`
			`{`
			`if (shift <= -64) {`
			`/* Rounding the sign bit always produces 0. */`
			`if (round) {`
			`return 0;`
			`}`
			`return src >> 63;`
			`} else if (shift < 0) {`
			`if (round) {`
			`src >>= -shift - 1;`
			`return (src >> 1) + (src & 1);`
			`}`
			`return src >> -shift;`
			`} else if (shift < 64) {`
			`int64_t val = src << shift;`
			`if (!sat \|\| val >> shift == src) {`
			`return val;`
			`}`
			`} else if (!sat \|\| src == 0) {`
			`return 0;`
			`}`

			`*sat = 1;`
			`return src < 0 ? INT64_MIN : INT64_MAX;`
			`}`

			`static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift,`
			`bool round, uint32_t *sat)`
			`{`
			`if (shift <= -(64 + round)) {`
			`return 0;`
			`} else if (shift < 0) {`
			`if (round) {`
			`src >>= -shift - 1;`
			`return (src >> 1) + (src & 1);`
			`}`
			`return src >> -shift;`
			`} else if (shift < 64) {`
			`uint64_t val = src << shift;`
			`if (!sat \|\| val >> shift == src) {`
			`return val;`
			`}`
			`} else if (!sat \|\| src == 0) {`
			`return 0;`
			`}`

			`*sat = 1;`
			`return UINT64_MAX;`
			`}`

			`static inline int64_t do_suqrshl_d(int64_t src, int64_t shift,`
			`bool round, uint32_t *sat)`
			`{`
			`if (sat && src < 0) {`
			`*sat = 1;`
			`return 0;`
			`}`
			`return do_uqrshl_d(src, shift, round, sat);`
			`}`

target/arm: Implement SVE2 complex integer multiply-add Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Message-id: 20210525010358.152808-38-richard.henderson@linaro.org Signed-off-by: Peter Maydell <peter.maydell@linaro.org> 2021-05-24 18:03:03 -07:00			`int8_t do_sqrdmlah_b(int8_t, int8_t, int8_t, bool, bool);`
			`int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);`
			`int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);`
			`int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);`

target/arm: Implement MVE VMULL (polynomial) Implement the MVE VMULL (polynomial) insn. Unlike Neon, this comes in two flavours: 8x8->16 and a 16x16->32. Also unlike Neon, the inputs are in either the low or the high half of each double-width element. The assembler for this insn indicates the size with "P8" or "P16", encoded into bit 28 as size = 0 or 1. We choose to follow the same encoding as VQDMULL and decode this into a->size as MO_16 or MO_32 indicating the size of the result elements. This then carries through to the helper function names where it then matches up with the existing pmull_h() which does an 8x8->16 operation and a new pmull_w() which does the 16x16->32. Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <richard.henderson@linaro.org> 2021-08-13 17:11:50 +01:00			`/*`
			`* 8 x 8 -> 16 vector polynomial multiply where the inputs are`
			`* in the low 8 bits of each 16-bit element`
			`*/`
			`uint64_t pmull_h(uint64_t op1, uint64_t op2);`
			`/*`
			`* 16 x 16 -> 32 vector polynomial multiply where the inputs are`
			`* in the low 16 bits of each 32-bit element`
			`*/`
			`uint64_t pmull_w(uint64_t op1, uint64_t op2);`

target/arm: Convert aes and sm4 to gvec helpers With this conversion, we will be able to use the same helpers with sve. In particular, pass 3 vector parameters for the 3-operand operations; for advsimd the destination register is also an input. This also fixes a bug in which we failed to clear the high bits of the SVE register after an AdvSIMD operation. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Message-id: 20200514212831.31248-2-richard.henderson@linaro.org Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org> 2020-05-14 14:28:26 -07:00			`#endif /* TARGET_ARM_VEC_INTERNALS_H */`