9 changed files with 137 additions and 1 deletion
@@ -0,0 +1,20 @@
|||
VI_VFP_BASE; |
|||
ZVLDOT_INIT(2); |
|||
|
|||
switch (P.VU.vsew) { |
|||
case 16: { |
|||
if (P.VU.altfmt) { |
|||
// Although this implementation in IEEE 754 arithmetic is valid, most
|
|||
// implementations will bulk-normalize on a VLEN-bit granule, then use
|
|||
// f32_add_bulknorm_odd for the final steps (possibly in a tree).
|
|||
// If a consensus emerges, we might change this implementation.
|
|||
require_extension(EXT_ZVFWLDOT16BF); |
|||
auto macc = [](auto a, auto b, auto c) { return f32_add_bulknorm_odd(c, f32_mul(bf16_to_f32(a), bf16_to_f32(b))); }; |
|||
ZVLDOT_GENERIC_LOOP(bfloat16_t, bfloat16_t, float32_t, macc); |
|||
} else { |
|||
require(false); |
|||
} |
|||
break; |
|||
} |
|||
default: require(false); |
|||
} |
|||
@@ -0,0 +1,23 @@
|||
ZVLDOT_INIT(4); |
|||
|
|||
switch (P.VU.vsew) { |
|||
case 8: { |
|||
require_extension(EXT_ZVQLDOT8I); |
|||
if (P.VU.altfmt) { |
|||
ZVLDOT_SIMPLE_LOOP(int8_t, int8_t, uint32_t); |
|||
} else { |
|||
ZVLDOT_SIMPLE_LOOP(uint8_t, int8_t, uint32_t); |
|||
} |
|||
break; |
|||
} |
|||
case 16: { |
|||
require_extension(EXT_ZVQLDOT16I); |
|||
if (P.VU.altfmt) { |
|||
ZVLDOT_SIMPLE_LOOP(int16_t, int16_t, uint64_t); |
|||
} else { |
|||
ZVLDOT_SIMPLE_LOOP(uint16_t, int16_t, uint64_t); |
|||
} |
|||
break; |
|||
} |
|||
default: require(false); |
|||
} |
|||
@@ -0,0 +1,23 @@
|||
ZVLDOT_INIT(4); |
|||
|
|||
switch (P.VU.vsew) { |
|||
case 8: { |
|||
require_extension(EXT_ZVQLDOT8I); |
|||
if (P.VU.altfmt) { |
|||
ZVLDOT_SIMPLE_LOOP(int8_t, uint8_t, uint32_t); |
|||
} else { |
|||
ZVLDOT_SIMPLE_LOOP(uint8_t, uint8_t, uint32_t); |
|||
} |
|||
break; |
|||
} |
|||
case 16: { |
|||
require_extension(EXT_ZVQLDOT16I); |
|||
if (P.VU.altfmt) { |
|||
ZVLDOT_SIMPLE_LOOP(int16_t, uint16_t, uint64_t); |
|||
} else { |
|||
ZVLDOT_SIMPLE_LOOP(uint16_t, uint16_t, uint64_t); |
|||
} |
|||
break; |
|||
} |
|||
default: require(false); |
|||
} |
|||
Loading…
Reference in new issue