Zvk: Infrastructure for Zvk extensions, element group handling

Introduce types and macros useful across multiple Zvk sub-extensions, including Zvbb and Zvbc. Those will be used by upcoming per-sub-extension commits. In particular we introduce "Element Group" types and loop macros handling those element groups. The concept of element group is described in <https://github.com/riscv/riscv-crypto/blob/master/doc/vector/riscv-crypto-vector-element-groups.adoc>. Note that the element group access method is not implemented for WORDS_BIGENDIAN setup. As such, isa_parser.cc is modified to emit an error when WORDS_BIGENDIAN is defined and extensions using element groups are enabled. Signed-off-by: Eric Gouriou <ego@rivosinc.com>
3 years ago · d5c0339484
6 changed files with 1148 additions and 2 deletions
--- a/riscv/arith.h
+++ b/riscv/arith.h
@ -7,6 +7,7 @@
 #include <cstdint>
 #include <climits>
 #include <cstddef>
+#include <type_traits>

 inline uint64_t mulhu(uint64_t a, uint64_t b)
 {
@ -221,4 +222,24 @@ static inline uint64_t xperm(uint64_t rs1, uint64_t rs2, size_t sz_log2, size_t
  return r;
 }

+// Rotates the unsigned integer 'x' right by 'shiftamt' bits; 'shiftamt' is reduced modulo the bit width of T.
+template <typename T>
+static inline T rotate_right(T x, std::size_t shiftamt) {
+  static_assert(std::is_unsigned<T>::value);
+  static constexpr T mask = (8 * sizeof(T)) - 1;
+  const std::size_t rshift = shiftamt & mask;
+  const std::size_t lshift = (-rshift) & mask;
+  return (x << lshift) | (x >> rshift);
+}
+
+// Rotates the unsigned integer 'x' left by 'shiftamt' bits; 'shiftamt' is reduced modulo the bit width of T.
+template <typename T>
+static inline T rotate_left(T x, std::size_t shiftamt) {
+  static_assert(std::is_unsigned<T>::value);
+  static constexpr T mask = (8 * sizeof(T)) - 1;
+  const std::size_t lshift = shiftamt & mask;
+  const std::size_t rshift = (-lshift) & mask;
+  return (x << lshift) | (x >> rshift);
+}
+
 #endif
--- a/riscv/isa_parser.cc
+++ b/riscv/isa_parser.cc
@ -361,7 +361,15 @@ isa_parser_t::isa_parser_t(const char* str, const char *priv)
      (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKSH])) {
    bad_isa_string(str, "'Zvkg', 'Zvkned', and 'Zvksh' extensions are incompatible with 'Zpn' extension in rv64");
  }
-
+#ifdef WORDS_BIGENDIAN
+  // Access to the vector registers as element groups is unimplemented on big-endian setups; reject every extension named in the message below.
+  if (extension_table[EXT_ZVKG] || extension_table[EXT_ZVKNED] || extension_table[EXT_ZVKNHA] ||
+      extension_table[EXT_ZVKNHB] || extension_table[EXT_ZVKSED] || extension_table[EXT_ZVKSH]) {
+    bad_isa_string(str,
+                   "'Zvkg', 'Zvkned', 'Zvknha', 'Zvknhb', 'Zvksed', and 'Zvksh' "
+                   "extensions are incompatible with WORDS_BIGENDIAN setups.");
+  }
+#endif
  std::string lowercase = strtolower(priv);
  bool user = false, supervisor = false;

--- a/riscv/v_ext_macros.h
+++ b/riscv/v_ext_macros.h
@ -325,6 +325,10 @@ static inline bool is_overlapped_widen(const int astart, int asize,
  type_usew_t<x>::type vs1 = P.VU.elt<type_usew_t<x>::type>(rs1_num, i); \
  type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);

+// Declares the operands of a single-source operation on element 'i', using
+// the element type type_usew_t<x>::type: 'vd' is a reference to the
+// destination element of 'rd_num' (fetched with is_write == true) and 'vs2'
+// is a copy of the source element of 'rs2_num'.
+#define V_U_PARAMS(x) \
+  type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
+  type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
 #define VX_U_PARAMS(x) \
  type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
  type_usew_t<x>::type rs1 = (type_usew_t<x>::type)RS1; \
@ -693,6 +697,24 @@ static inline bool is_overlapped_widen(const int astart, int asize,
  } \
  VI_LOOP_END 

+// Expands to VI_CHECK_SSS(false) followed by a VI_LOOP_BASE ... VI_LOOP_END
+// region in which BODY is evaluated with 'vd' and 'vs2' declared by
+// V_U_PARAMS for the current 'sew' (e8, e16, e32 or e64). BODY is not
+// evaluated for any other 'sew' value.
+#define VI_V_ULOOP(BODY) \
+  VI_CHECK_SSS(false) \
+  VI_LOOP_BASE \
+  if (sew == e8) { \
+    V_U_PARAMS(e8); \
+    BODY; \
+  } else if (sew == e16) { \
+    V_U_PARAMS(e16); \
+    BODY; \
+  } else if (sew == e32) { \
+    V_U_PARAMS(e32); \
+    BODY; \
+  } else if (sew == e64) { \
+    V_U_PARAMS(e64); \
+    BODY; \
+  } \
+  VI_LOOP_END
+
 #define VI_VX_ULOOP(BODY) \
  VI_CHECK_SSS(false) \
  VI_LOOP_BASE \
--- a/riscv/vector_unit.cc
+++ b/riscv/vector_unit.cc
@ -86,6 +86,56 @@ template<class T> T& vectorUnit_t::elt(reg_t vReg, reg_t n, bool UNUSED is_write
  return regStart[n];
 }

+// Returns a reference to element group 'n' (of type EG, a std::array<T, N>)
+// located 'n * sizeof(EG elements)' bytes past the start of register 'vReg'.
+//
+// The logic differences between 'elt()' and 'elt_group()' come from
+// the fact that, while 'elt()' requires that the element is fully
+// contained in a single vector register, the element group may span
+// multiple registers in a single register group (LMUL>1).
+//
+// Notes:
+// - We do NOT check that a single element - i.e., the T in the element
+//   group type std::array<T, N> - fits within a single register, or that
+//   T is smaller than or equal to VSEW. Implementations of the instructions
+//   sometimes use a different T than what the specification suggests.
+//   Instruction implementations should 'require()' what the specification
+//   dictates.
+// - We do NOT check that 'vReg' is a valid register group, or that
+//   'n+1' element groups fit in the register group 'vReg'. It is
+//   the responsibility of the caller to validate those preconditions.
+template<typename EG> EG&
+vectorUnit_t::elt_group(reg_t vReg, reg_t n, bool UNUSED is_write) {
+#ifdef WORDS_BIGENDIAN
+  // Unsupported on WORDS_BIGENDIAN builds: report the problem and abort.
+  fputs("vectorUnit_t::elt_group is not compatible with WORDS_BIGENDIAN setup.\n",
+          stderr);
+  abort();
+#endif
+  using T = typename EG::value_type;
+  constexpr std::size_t N = std::tuple_size<EG>::value;
+  assert(N > 0);
+
+  assert(vsew != 0);
+  // Sizes below are in bytes ('VLEN >> 3' is the size of one register, see 'bytes_per_reg').
+  constexpr reg_t elt_group_size = N * sizeof(T);
+  const reg_t reg_group_size = (VLEN >> 3) * vflmul;
+  assert(((n + 1) * elt_group_size) <= reg_group_size);
+
+  const reg_t start_byte = n * elt_group_size;
+  const reg_t bytes_per_reg = VLEN >> 3;
+
+  // Inclusive first/last register indices.
+  const reg_t reg_first = vReg + start_byte / bytes_per_reg;
+  const reg_t reg_last = vReg + (start_byte + elt_group_size - 1) / bytes_per_reg;
+
+  // Mark every register covered by this element group as referenced and, for
+  // writes with commit logging enabled, set its log_reg_write entry to {0, 0}.
+  for (reg_t vidx = reg_first; vidx <= reg_last; ++vidx) {
+      reg_referenced[vidx] = 1;
+
+      if (unlikely(p->get_log_commits_enabled() && is_write)) {
+          p->get_state()->log_reg_write[(vidx << 4) | 2] = {0, 0};
+      }
+  }
+
+  // 'start_byte' is relative to the start of 'vReg', so the address may fall in a later register.
+  return *(EG*)((char*)reg_file + vReg * (VLEN >> 3) + start_byte);
+}
+
 template signed char& vectorUnit_t::elt<signed char>(reg_t, reg_t, bool);
 template short& vectorUnit_t::elt<short>(reg_t, reg_t, bool);
 template int& vectorUnit_t::elt<int>(reg_t, reg_t, bool);
@ -98,3 +148,8 @@ template uint64_t& vectorUnit_t::elt<uint64_t>(reg_t, reg_t, bool);
 template float16_t& vectorUnit_t::elt<float16_t>(reg_t, reg_t, bool);
 template float32_t& vectorUnit_t::elt<float32_t>(reg_t, reg_t, bool);
 template float64_t& vectorUnit_t::elt<float64_t>(reg_t, reg_t, bool);
+
+// Explicit instantiations of elt_group() for the EGU*_t element group types.
+template EGU32x4_t& vectorUnit_t::elt_group<EGU32x4_t>(reg_t, reg_t, bool);
+template EGU32x8_t& vectorUnit_t::elt_group<EGU32x8_t>(reg_t, reg_t, bool);
+template EGU64x4_t& vectorUnit_t::elt_group<EGU64x4_t>(reg_t, reg_t, bool);
+template EGU8x16_t& vectorUnit_t::elt_group<EGU8x16_t>(reg_t, reg_t, bool);
--- a/riscv/vector_unit.h
+++ b/riscv/vector_unit.h
@ -2,6 +2,9 @@
 #ifndef _RISCV_VECTOR_UNIT_H
 #define _RISCV_VECTOR_UNIT_H

+#include <array>
+#include <cstdint>
+
 #include "decode.h"
 #include "csrs.h"

@ -69,6 +72,17 @@ struct type_sew_t<64>
  using type=int64_t;
 };

+// Element group of four 32-bit elements (128 bits total).
+using EGU32x4_t = std::array<uint32_t, 4>;
+
+// Element group of eight 32-bit elements (256 bits total).
+using EGU32x8_t = std::array<uint32_t, 8>;
+
+// Element group of four 64-bit elements (256 bits total).
+using EGU64x4_t = std::array<uint64_t, 4>;
+
+// Element group of sixteen 8-bit elements (128 bits total).
+using EGU8x16_t = std::array<uint8_t, 16>;

 class vectorUnit_t
 {
@ -88,8 +102,11 @@ public:
  bool vill;
  bool vstart_alu;

-  // vector element for varies SEW
+  // vector element for various SEW
  template<class T> T& elt(reg_t vReg, reg_t n, bool is_write = false);
+  // Vector element group access: returns a reference to element group 'n' of
+  // the register group starting at 'vReg', where EG is a std::array<T, N>.
+  template<typename EG> EG&
+  elt_group(reg_t vReg, reg_t n, bool is_write = false);

 public:

--- a/riscv/zvk_ext_macros.h
+++ b/riscv/zvk_ext_macros.h