From 22642cb32dbd74f954f81623621d99c1251a6e81 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Wed, 16 Oct 2024 11:34:44 +0900 Subject: [PATCH 01/10] [test] Implement stricter Memory checks --- test/test_by_xed.py | 76 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 17 deletions(-) diff --git a/test/test_by_xed.py b/test/test_by_xed.py index 1e84c6ae..13d695c1 100644 --- a/test/test_by_xed.py +++ b/test/test_by_xed.py @@ -76,7 +76,7 @@ def newReg(s): return s class Memory: - def __init__(self, size=0, base=None, index=None, scale=0, disp=0, broadcast=False): + def __init__(self, size=0, base=None, index=None, scale=0, disp=0, broadcast=0): self.size = size self.base = newReg(base) self.index = newReg(index) @@ -85,8 +85,12 @@ def __init__(self, size=0, base=None, index=None, scale=0, disp=0, broadcast=Fal self.broadcast = broadcast def __str__(self): - s = 'ptr' if self.size == 0 else g_sizeTbl[int(math.log2(self.size))] - if self.broadcast: + if self.size == 0: + s = 'ptr' + else: + idx = self.size * max(self.broadcast, 1) + s = g_sizeTbl[int(math.log2(idx))] + if self.broadcast > 0: s += '_b' s += ' [' needPlus = False @@ -107,23 +111,36 @@ def __str__(self): s += ']' return s + # Xbyak uses 'ptr' when it can be automatically detected, so we should consider this in the comparison. def __eq__(self, rhs): - # xbyak uses ptr if it is automatically detected, so xword == ptr is true - if self.broadcast != rhs.broadcast: return False -# if not self.broadcast and 0 < self.size <= 8 and 0 < rhs.size <= 8 and self.size != rhs.size: return False - if not self.broadcast and self.size > 0 and rhs.size > 0 and self.size != rhs.size: return False + if self.broadcast > rhs.broadcast: + return rhs == self + assert(self.broadcast <= rhs.broadcast) + if self.broadcast == 0: + if rhs.broadcast > 0: return False + # Xbyak uses 'ptr' when it is automatically detected. + # Therefore, the comparison is true if 'ptr' (i.e., size = 0) is used. 
+ if 0 < self.size and 0 < rhs.size and self.size != rhs.size: return False + if self.broadcast == 1: # _b + if rhs.broadcast == 1: # compare ptr_b with ptr_b + if self.size != rhs.size: + return False + if self.size > 0 and (self.size != rhs.size * rhs.broadcast): # compare ptr_b with {1toX} + return False + else: + if self.broadcast != rhs.broadcast: return False r = self.base == rhs.base and self.index == rhs.index and self.scale == rhs.scale and self.disp == rhs.disp return r def parseBroadcast(s): if '_b' in s: - return (s.replace('_b', ''), True) - r = re.search(r'({1to\d+})', s) + return (s.replace('_b', ''), 1) + r = re.search(r'({1to(\d+)})', s) if not r: - return (s, False) - return (s.replace(r.group(1), ''), True) + return (s, 0) + return (s.replace(r.group(1), ''), int(r.group(2))) -def parseMemory(s, broadcast=False): +def parseMemory(s, broadcast=0): org_s = s s = s.replace(' ', '').lower() @@ -133,7 +150,7 @@ def parseMemory(s, broadcast=False): scale = 0 disp = 0 - if not broadcast: + if broadcast == 0: (s, broadcast) = parseBroadcast(s) # Parse size @@ -157,7 +174,7 @@ def parseMemory(s, broadcast=False): s = s[3:] if s.startswith('_b'): - broadcast = True + broadcast = 1 s = s[2:] # Extract the content inside brackets @@ -335,7 +352,7 @@ def parseMemoryTest(): ('[]', Memory()), ('[rax]', Memory(0, rax)), ('ptr[rax]', Memory(0, rax)), - ('ptr_b[rax]', Memory(0, rax, broadcast=True)), + ('ptr_b[rax]', Memory(0, rax, broadcast=1)), ('dword[rbx]', Memory(4, rbx)), ('xword ptr[rcx]', Memory(16, rcx)), ('xmmword ptr[rcx]', Memory(16, rcx)), @@ -344,11 +361,36 @@ def parseMemoryTest(): ('[0x12345]', Memory(0, None, None, 0, 0x12345)), ('yword [rax+rdx*4]', Memory(32, rax, rdx, 4)), ('zword [rax+rdx*4+123]', Memory(64, rax, rdx, 4, 123)), + ('xword_b [rax]', Memory(16, rax, None, 0, 0, 1)), + ('dword [rax]{1to4}', Memory(16, rax, None, 0, 0, 1)), + ('yword_b [rax]', Memory(32, rax, None, 0, 0, 1)), + ('dword [rax]{1to8}', Memory(32, rax, None, 0, 0, 1)), 
] for (s, expected) in tbl: my = parseMemory(s) assertEqualStr(my, expected) + print('compare test') + tbl = [ + ('ptr[rax]', 'dword[rax]', True), + ('byte[rax]', 'dword[rax]', False), + ('yword_b[rax]', 'dword [rax]{1to8}', True), + ('yword_b[rax]', 'word [rax]{1to16}', True), + ('zword_b[rax]', 'word [rax]{1to32}', True), + ('zword_b[rax]', 'word [rax]{1to16}', False), + ('dword [rax]{1to2}', 'dword [rax] {1to4}', False), + ('zword_b[rax]', 'xword_b [rax]', False), + ('ptr_b[rax]', 'word [rax]{1to32}', True), # ignore size + ] + for (lhs, rhs, eq) in tbl: + a = parseMemory(lhs) + b = parseMemory(rhs) + if eq: + assertEqual(a, b) + assertEqual(b, a) + else: + assert(parseMemory(lhs) != parseMemory(rhs)) + def parseNmemonicTest(): print('parseNmemonicTest') tbl = [ @@ -364,8 +406,8 @@ def parseNmemonicTest(): ('vpcompressw(zmm30 | k2 |T_z, zmm1);', Nmemonic('vpcompressw', [zmm30, zmm1], [k2, T_z])), ('vpcompressw zmm30{k2}{z}, zmm1', Nmemonic('vpcompressw', [zmm30, zmm1], [k2, T_z])), ('vpshldw(xmm9|k3|T_z, xmm2, ptr [rax + 0x40], 5);', Nmemonic('vpshldw', [xmm9, xmm2, Memory(0, rax, None, 0, 0x40), 5], [k3, T_z])), - ('vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])), - ('vpshrdd xmm5{k3}{z}, xmm2, dword ptr [rax+0x40]{1to4}, 0x5', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])), + ('vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, 1), 5], [k3, T_z])), + ('vpshrdd xmm5{k3}{z}, xmm2, dword ptr [rax+0x40]{1to4}, 0x5', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, 4), 5], [k3, T_z])), ('vcmpph(k1, xmm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xmm15, Memory(0, rax, None, 0, 64), 1])), ] for (s, expected) in tbl: From c5704a21d910b5d59023afb3b6ed13eca04e65ab Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Thu, 17 Oct 2024 14:29:57 +0900 Subject: 
[PATCH 02/10] Updated to comply with AVX10.2 specification rev 2.0 --- gen/gen_avx512.cpp | 14 +++++++------- xbyak/xbyak_mnemonic.h | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index e4d319ee..293cb5bc 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -202,13 +202,13 @@ void putX_XM() { 0x2F, "vcomish", T_MUST_EVEX | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, { 0x2E, "vucomish", T_MUST_EVEX | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, - { 0x2F, "vcomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 }, - { 0x2F, "vcomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, - { 0x2F, "vcomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 }, + { 0x2F, "vcomxsd", T_MUST_EVEX | T_F2 | T_0F | T_EW1 | T_SAE_X | T_N8 }, + { 0x2F, "vcomxsh", T_MUST_EVEX | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, + { 0x2F, "vcomxss", T_MUST_EVEX | T_F3 | T_0F | T_EW0 | T_SAE_X | T_N4 }, - { 0x2E, "vucomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 }, - { 0x2E, "vucomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, - { 0x2E, "vucomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 }, + { 0x2E, "vucomxsd", T_MUST_EVEX | T_F2 | T_0F | T_EW1 | T_SAE_X | T_N8 }, + { 0x2E, "vucomxsh", T_MUST_EVEX | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, + { 0x2E, "vucomxss", T_MUST_EVEX | T_F3 | T_0F | T_EW0 | T_SAE_X | T_N4 }, // 13.1 { 0x69, "vcvtnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 }, @@ -893,7 +893,7 @@ void putX_XM_IMM() { 0x62, "vpexpandw", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_N2, false }, { 0x2F, "vcomsbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_N2, false }, - { 0x42, "vgetexppbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x42, "vgetexppbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, { 0x26, "vgetmantpbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, { 0x4C, "vrcppbf16", 
T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, { 0x56, "vreducenepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 087db031..d77daeca 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -2186,9 +2186,9 @@ void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x8A); } void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8A); } void vcomsbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_MAP5|T_EW0|T_MUST_EVEX, 0x2F); } -void vcomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F3|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2F); } -void vcomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } -void vcomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F2|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F2|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F3|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } void vcvt2ps2phx(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x67); } void vcvtbiasph2bf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } void vcvtbiasph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } @@ -2372,7 +2372,7 @@ void vgatherpf1qpd(const Address& addr) { 
opGatherFetch(addr, zm2, T_N8|T_66|T_0 void vgatherpf1qps(const Address& addr) { opGatherFetch(addr, zm2, T_N4|T_66|T_0F38|T_EW0|T_MUST_EVEX|T_M_K|T_VSIB, 0xC7, Operand::ZMM); } void vgatherqpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 0); } void vgatherqps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 2); } -void vgetexppbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x42); } +void vgetexppbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x42); } void vgetexppd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x42); } void vgetexpph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x42); } void vgetexpps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x42); } @@ -2656,9 +2656,9 @@ void vsubnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM void vsubph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5C); } void vsubsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5C); } void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } -void vucomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F3|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2E); } -void vucomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } -void vucomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, 
T_N4|T_F2|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } +void vucomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F2|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2E); } +void vucomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } +void vucomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F3|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } #ifdef XBYAK64 void kmovq(const Reg64& r, const Opmask& k) { opKmov(k, r, true, 64); } void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x7C); } From d79ad49523f3467f83c80e3f040bd8784541145d Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Thu, 17 Oct 2024 14:44:12 +0900 Subject: [PATCH 03/10] Suppress AVX10.2 rev.2 tests in xed_test until xed is updated --- test/Makefile | 3 ++- test/avx10/bf16.txt | 18 +++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/test/Makefile b/test/Makefile index cf5c7163..a61895fd 100644 --- a/test/Makefile +++ b/test/Makefile @@ -60,7 +60,8 @@ apx: apx.cpp $(XBYAK_INC) avx10_test: avx10_test.cpp $(XBYAK_INC) $(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64 -TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt misc.txt convert.txt minmax.txt saturation.txt +#TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt misc.txt convert.txt minmax.txt saturation.txt +TEST_FILES=old.txt new-ymm.txt bf16.txt misc.txt convert.txt minmax.txt saturation.txt xed_test: @set -e; \ for target in $(addprefix avx10/, $(TEST_FILES)); do \ diff --git a/test/avx10/bf16.txt b/test/avx10/bf16.txt index c544e02c..a387c610 100644 --- a/test/avx10/bf16.txt +++ b/test/avx10/bf16.txt @@ -113,17 +113,17 @@ vfpclasspbf16(k7|k5, zword_b[rax+128], 13); vcomsbf16(xm2, xm3); vcomsbf16(xm2, ptr[rax+128]); -vgetexppbf16(xm1|k3, xmm2); -vgetexppbf16(xm1|k3, ptr[rax+128]); -vgetexppbf16(xm1|k3, ptr_b[rax+128]); +//vgetexppbf16(xm1|k3, xmm2); +//vgetexppbf16(xm1|k3, ptr[rax+128]); 
+//vgetexppbf16(xm1|k3, ptr_b[rax+128]); -vgetexppbf16(ym1|k3, ymm2); -vgetexppbf16(ym1|k3, ptr[rax+128]); -vgetexppbf16(ym1|k3, ptr_b[rax+128]); +//vgetexppbf16(ym1|k3, ymm2); +//vgetexppbf16(ym1|k3, ptr[rax+128]); +//vgetexppbf16(ym1|k3, ptr_b[rax+128]); -vgetexppbf16(zm1|k3, zmm2); -vgetexppbf16(zm1|k3, ptr[rax+128]); -vgetexppbf16(zm1|k3, ptr_b[rax+128]); +//vgetexppbf16(zm1|k3, zmm2); +//vgetexppbf16(zm1|k3, ptr[rax+128]); +//vgetexppbf16(zm1|k3, ptr_b[rax+128]); vgetmantpbf16(xm1|k3, xmm2, 3); vgetmantpbf16(xm1|k3, ptr[rax+128], 5); From d7ed9fdefd39f0e3829758d8135f74aa4bd426e2 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Thu, 17 Oct 2024 18:31:28 +0900 Subject: [PATCH 04/10] [skip ci] [doc] reorder the paragraph --- doc/usage.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/usage.md b/doc/usage.md index b8073cea..132020ae 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -110,6 +110,15 @@ vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit ``` +### Remark +* `k1`, ..., `k7` are opmask registers. + - `k0` is dealt as no mask. + - e.g. `vmovaps(zmm0|k0, ptr[rax]);` and `vmovaps(zmm0|T_z, ptr[rax]);` are same to `vmovaps(zmm0, ptr[rax]);`. +* use `| T_z`, `| T_sae`, `| T_rn_sae`, `| T_rd_sae`, `| T_ru_sae`, `| T_rz_sae` instead of `,{z}`, `,{sae}`, `,{rn-sae}`, `,{rd-sae}`, `,{ru-sae}`, `,{rz-sae}` respectively. +* `k4 | k3` is different from `k3 | k4`. +* use `ptr_b` for broadcast `{1toX}`. X is automatically determined. +* specify `xword`/`yword`/`zword(_b)` for m128/m256/m512 if necessary. + ## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8, AVX10.2. Some mnemonics have some types of encodings: VEX, EVEX, AVX10.2. The functions for these mnemonics include an optional parameter as the last argument to specify the encoding. 
@@ -151,15 +160,6 @@ feature|AVX-VNNI-INT8, AVX512-FP16|AVX10.2 At first, I attempted to use EvexEncoding (resp. VexEncoding) instead of AVX10v2Encoding (resp. EvexEncoding) for `setDefaultEncodingAVX10`. But I abandoned this idea when I found that `vmovd` and `vmovw` had different EVEX encodings in AVX512 and AVX10.2 -### Remark -* `k1`, ..., `k7` are opmask registers. - - `k0` is dealt as no mask. - - e.g. `vmovaps(zmm0|k0, ptr[rax]);` and `vmovaps(zmm0|T_z, ptr[rax]);` are same to `vmovaps(zmm0, ptr[rax]);`. -* use `| T_z`, `| T_sae`, `| T_rn_sae`, `| T_rd_sae`, `| T_ru_sae`, `| T_rz_sae` instead of `,{z}`, `,{sae}`, `,{rn-sae}`, `,{rd-sae}`, `,{ru-sae}`, `,{rz-sae}` respectively. -* `k4 | k3` is different from `k3 | k4`. -* use `ptr_b` for broadcast `{1toX}`. X is automatically determined. -* specify `xword`/`yword`/`zword(_b)` for m128/m256/m512 if necessary. - ## APX [Advanced Performance Extensions (APX) Architecture Specification](https://www.intel.com/content/www/us/en/content-details/786223/intel-advanced-performance-extensions-intel-apx-architecture-specification.html) - Support 64-bit 16 additional GPRs (general-purpose registers) r16, ..., r31 From 4e2efab94ef6302cce5f4f5b6eaec3312cd4eb00 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sat, 19 Oct 2024 08:54:22 +0900 Subject: [PATCH 05/10] The encoding of vmovd(w) with REG-to-XMM operands is not affected by the encoding flag. 
--- test/misc.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++++ xbyak/xbyak.h | 9 +++++---- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/test/misc.cpp b/test/misc.cpp index bc5083b3..b4874a7a 100644 --- a/test/misc.cpp +++ b/test/misc.cpp @@ -2284,4 +2284,48 @@ CYBOZU_TEST_AUTO(avx_vnni_int) CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); } +CYBOZU_TEST_AUTO(vmovd) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + setDefaultEncodingAVX10(PreAVX10v2Encoding); + vmovd(eax, xm1); // always AVX10.1 + vmovd(xm3, xm1); // always AVX10.2 + // AVX-512 (AVX10.1) + vmovd(ptr[rax+128], xm1); + vmovd(xm1, ptr[rax+128]); + vmovd(ptr[rax+128], xm30); + vmovd(xm30, ptr[rax+128]); + + setDefaultEncodingAVX10(AVX10v2Encoding); + vmovd(eax, xm1); // always AVX10.1 + vmovd(xm3, xm1); // always AVX10.2 + // AVX10.2 + vmovd(ptr[rax+128], xm1); + vmovd(xm1, ptr[rax+128]); + vmovd(ptr[rax+128], xm30); + vmovd(xm30, ptr[rax+128]); + } + } c; + const uint8_t tbl[] = { + 0xc5, 0xf9, 0x7e, 0xc8, // avx10.1 + 0x62, 0xf1, 0x7e, 0x08, 0x7e, 0xd9, // avx10.2 + 0xc5, 0xf9, 0x7e, 0x88, 0x80, 0x00, 0x00, 0x00, // avx + 0xc5, 0xf9, 0x6e, 0x88, 0x80, 0x00, 0x00, 0x00, // avx + 0x62, 0x61, 0x7d, 0x08, 0x7e, 0x70, 0x20, // avx10.1 + 0x62, 0x61, 0x7d, 0x08, 0x6e, 0x70, 0x20, // avx10.1 + + 0xc5, 0xf9, 0x7e, 0xc8, // avx10.1 + 0x62, 0xf1, 0x7e, 0x08, 0x7e, 0xd9, // avx10.2 + 0x62, 0xf1, 0x7d, 0x08, 0xd6, 0x48, 0x20, // avx10.2 + 0x62, 0xf1, 0x7e, 0x08, 0x7e, 0x48, 0x20, // avx10.2 + 0x62, 0x61, 0x7d, 0x08, 0xd6, 0x70, 0x20, // avx10.2 + 0x62, 0x61, 0x7e, 0x08, 0x7e, 0x70, 0x20, // avx10.2 + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} + #endif diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index b3a4df86..001f5aeb 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -2809,11 +2809,12 @@ class CodeGenerator : public CodeArray { std::swap(p1, p2); rev = !rev; } + enc = getEncoding(enc, 1); int 
sel = -1; - if (getEncoding(enc, 1) == AVX10v2Encoding) { - if ((p1->isXMM() || p1->isMEM()) && p2->isXMM()) sel = 2 + int(rev); - } else { - if ((p1->isREG(bit) || p1->isMEM()) && p2->isXMM()) sel = int(rev); + if (p1->isXMM() || (p1->isMEM() && enc == AVX10v2Encoding)) { + sel = 2 + int(rev); + } else if (p1->isREG(bit) || p1->isMEM()) { + sel = int(rev); } if (sel == -1) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(*static_cast(p2), xm0, *p1, typeTbl[sel], codeTbl[sel]); From 6d48afc6c04e044bc1cb5c3817e85b68963909f3 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sat, 19 Oct 2024 08:54:43 +0900 Subject: [PATCH 06/10] [skip ci] [doc] update encoding of vmovd and vmovw with REG-to-XMM --- doc/usage.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/doc/usage.md b/doc/usage.md index 132020ae..b28d772c 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -154,11 +154,17 @@ feature|AVX512-VNNI|AVX-VNNI -|-|- feature|AVX-VNNI-INT8, AVX512-FP16|AVX10.2 -- Target functions: vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds, vmovd, vmovw +- Target functions: vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud, vpdpwsuds, vpdpwusd, vpdpwusds, vpdpwuud, vpdpwuuds, and vmovd, vmovw with XMM-to-MEM or MEM-to-XMM operands. -- Remark: vmovd and vmovw several kinds of encoding such as AVX/AVX512F/AVX512-FP16/AVX10.2. -At first, I attempted to use EvexEncoding (resp. VexEncoding) instead of AVX10v2Encoding (resp. EvexEncoding) for `setDefaultEncodingAVX10`. -But I abandoned this idea when I found that `vmovd` and `vmovw` had different EVEX encodings in AVX512 and AVX10.2 +### Remark + +1. `vmovd` and `vmovw` instructions with REG-to-XMM or XMM-to-REG operands are always encoded using AVX10.1. +When used with XMM-to-XMM operands, these instructions are always encoded using AVX10.2. + +2. 
`vmovd` and `vmovw` instructions with MEM-to-MEM operands support multiple encoding formats, including AVX, AVX512F, AVX512-FP16, and AVX10.2. + +Initially, I tried implementing `setDefaultEncodingAVX10` using `EvexEncoding` (resp. `VexEncoding`) instead of `AVX10v2Encoding` (resp. `EvexEncoding`). +However, I abandoned this approach after discovering the complexity of the encoding requirements of `vmovd` and `vmovw`. ## APX [Advanced Performance Extensions (APX) Architecture Specification](https://www.intel.com/content/www/us/en/content-details/786223/intel-advanced-performance-extensions-intel-apx-architecture-specification.html) From ab9653f483bd9b985524a0e2bb30cdb7d3b79e9d Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sat, 19 Oct 2024 09:43:24 +0900 Subject: [PATCH 07/10] add test of vmovw --- test/misc.cpp | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/test/misc.cpp b/test/misc.cpp index b4874a7a..3ebb74c7 100644 --- a/test/misc.cpp +++ b/test/misc.cpp @@ -2291,6 +2291,7 @@ CYBOZU_TEST_AUTO(vmovd) { setDefaultEncodingAVX10(PreAVX10v2Encoding); vmovd(eax, xm1); // always AVX10.1 + vmovd(xm1, eax); // always AVX10.1 vmovd(xm3, xm1); // always AVX10.2 // AVX-512 (AVX10.1) vmovd(ptr[rax+128], xm1); @@ -2300,6 +2301,7 @@ CYBOZU_TEST_AUTO(vmovd) setDefaultEncodingAVX10(AVX10v2Encoding); vmovd(eax, xm1); // always AVX10.1 + vmovd(xm1, eax); // always AVX10.1 vmovd(xm3, xm1); // always AVX10.2 // AVX10.2 vmovd(ptr[rax+128], xm1); @@ -2310,6 +2312,7 @@ CYBOZU_TEST_AUTO(vmovd) } c; const uint8_t tbl[] = { 0xc5, 0xf9, 0x7e, 0xc8, // avx10.1 + 0xc5, 0xf9, 0x6e, 0xc8, // avx10.1 0x62, 0xf1, 0x7e, 0x08, 0x7e, 0xd9, // avx10.2 0xc5, 0xf9, 0x7e, 0x88, 0x80, 0x00, 0x00, 0x00, // avx 0xc5, 0xf9, 0x6e, 0x88, 0x80, 0x00, 0x00, 0x00, // avx @@ -2317,6 +2320,7 @@ CYBOZU_TEST_AUTO(vmovd) 0x62, 0x61, 0x7d, 0x08, 0x6e, 0x70, 0x20, // avx10.1 0xc5, 0xf9, 0x7e, 0xc8, // avx10.1 + 0xc5, 0xf9, 0x6e, 0xc8, // avx10.1 0x62, 
0xf1, 0x7e, 0x08, 0x7e, 0xd9, // avx10.2 0x62, 0xf1, 0x7d, 0x08, 0xd6, 0x48, 0x20, // avx10.2 0x62, 0xf1, 0x7e, 0x08, 0x7e, 0x48, 0x20, // avx10.2 @@ -2328,4 +2332,52 @@ CYBOZU_TEST_AUTO(vmovd) CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); } +CYBOZU_TEST_AUTO(vmovw) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + setDefaultEncodingAVX10(PreAVX10v2Encoding); + vmovw(eax, xm1); // always avx10.1 + vmovw(xm1, eax); // always avx10.1 + vmovw(xm3, xm1); // always avx10.2 + // AVX10.1 + vmovw(ptr[rax+128], xm1); + vmovw(xm1, ptr[rax+128]); + vmovw(ptr[rax+128], xm30); + vmovw(xm30, ptr[rax+128]); + + setDefaultEncodingAVX10(AVX10v2Encoding); + vmovw(eax, xm1); // always avx10.1 + vmovw(xm1, eax); // always avx10.1 + vmovw(xm3, xm1); // always avx10.2 + // AVX10.2 + vmovw(ptr[rax+128], xm1); + vmovw(xm1, ptr[rax+128]); + vmovw(ptr[rax+128], xm30); + vmovw(xm30, ptr[rax+128]); + } + } c; + const uint8_t tbl[] = { + 0x62, 0xf5, 0x7d, 0x08, 0x7e, 0xc8, + 0x62, 0xf5, 0x7d, 0x08, 0x6e, 0xc8, + 0x62, 0xf5, 0x7e, 0x08, 0x6e, 0xd9, + 0x62, 0xf5, 0x7d, 0x08, 0x7e, 0x48, 0x40, + 0x62, 0xf5, 0x7d, 0x08, 0x6e, 0x48, 0x40, + 0x62, 0x65, 0x7d, 0x08, 0x7e, 0x70, 0x40, + 0x62, 0x65, 0x7d, 0x08, 0x6e, 0x70, 0x40, + + 0x62, 0xf5, 0x7d, 0x08, 0x7e, 0xc8, + 0x62, 0xf5, 0x7d, 0x08, 0x6e, 0xc8, + 0x62, 0xf5, 0x7e, 0x08, 0x6e, 0xd9, + 0x62, 0xf5, 0x7e, 0x08, 0x7e, 0x48, 0x40, + 0x62, 0xf5, 0x7e, 0x08, 0x6e, 0x48, 0x40, + 0x62, 0x65, 0x7e, 0x08, 0x7e, 0x70, 0x40, + 0x62, 0x65, 0x7e, 0x08, 0x6e, 0x70, 0x40, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} + #endif From 0a6ca187614a066f9c0b5ed8cf3bbb1c45f2e11a Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sat, 19 Oct 2024 09:43:44 +0900 Subject: [PATCH 08/10] add test_by_xed for win --- test/test_by_xed.bat | 6 ++++++ test/test_by_xed_all.bat | 5 +++++ 2 files changed, 11 insertions(+) create mode 100644 test/test_by_xed.bat create mode 
100644 test/test_by_xed_all.bat diff --git a/test/test_by_xed.bat b/test/test_by_xed.bat new file mode 100644 index 00000000..bf6ee5ec --- /dev/null +++ b/test/test_by_xed.bat @@ -0,0 +1,6 @@ +@echo off +set CFLAGS=-I ../ /EHsc /nologo +copy %1% tmp.cpp +cl %CFLAGS% test_by_xed.cpp && test_by_xed.exe +%XED% -64 -ir bin > out.txt +python3 test_by_xed.py %1% out.txt diff --git a/test/test_by_xed_all.bat b/test/test_by_xed_all.bat new file mode 100644 index 00000000..bb57cb40 --- /dev/null +++ b/test/test_by_xed_all.bat @@ -0,0 +1,5 @@ +set TARGETS=old.txt new-ymm.txt bf16.txt misc.txt convert.txt minmax.txt saturation.txt +for %%f in (%TARGETS%) do ( + echo %%f + call test_by_xed.bat avx10\%%f +) \ No newline at end of file From 8939b5a2d69f25b1598d400133740f797eebd63f Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 20 Oct 2024 05:44:07 +0900 Subject: [PATCH 09/10] [skip ci] [doc] tweak --- doc/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/usage.md b/doc/usage.md index b28d772c..5e1946a9 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -161,7 +161,7 @@ feature|AVX-VNNI-INT8, AVX512-FP16|AVX10.2 1. `vmovd` and `vmovw` instructions with REG-to-XMM or XMM-to-REG operands are always encoded using AVX10.1. When used with XMM-to-XMM operands, these instructions are always encoded using AVX10.2. -2. `vmovd` and `vmovw` instructions with MEM-to-MEM operands support multiple encoding formats, including AVX, AVX512F, AVX512-FP16, and AVX10.2. +2. `vmovd` and `vmovw` instructions with XMM-to-MEM or MEM-to-XMM operands support multiple encoding formats, including AVX, AVX512F, AVX512-FP16, and AVX10.2. Initially, I tried implementing `setDefaultEncodingAVX10` using `EvexEncoding` (resp. `VexEncoding`) instead of `AVX10v2Encoding` (resp. `EvexEncoding`). However, I abandoned this approach after discovering the complexity of the encoding requirements of `vmovd` and `vmovw`. 
From 565ad4e809c1aa80e295613347420812b3b5ac1a Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Wed, 30 Oct 2024 06:39:05 +0900 Subject: [PATCH 10/10] v7.20.1 --- CMakeLists.txt | 2 +- doc/changelog.md | 1 + meson.build | 2 +- readme.md | 2 +- readme.txt | 3 ++- xbyak/xbyak.h | 2 +- xbyak/xbyak_mnemonic.h | 2 +- 7 files changed, 8 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ded27c3..5b065121 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.5) -project(xbyak LANGUAGES CXX VERSION 7.20) +project(xbyak LANGUAGES CXX VERSION 7.20.1) file(GLOB headers xbyak/*.h) diff --git a/doc/changelog.md b/doc/changelog.md index e7264888..47117d8d 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -1,5 +1,6 @@ # History +* 2024/Oct/17 ver 7.20.1 Updated to comply with AVX10.2 specification rev 2.0 * 2024/Oct/15 ver 7.20 Fixed the specification of setDefaultEncoding, setDefaultEncodingAVX10. * 2024/Oct/15 ver 7.11 Added full support for AVX10.2 * 2024/Oct/13 ver 7.10 support AVX10 integer and fp16 vnni, media new instructions. setDefaultEncoding is extended. 
diff --git a/meson.build b/meson.build index b69a379b..5bb3b78e 100644 --- a/meson.build +++ b/meson.build @@ -5,7 +5,7 @@ project( 'xbyak', 'cpp', - version: '7.20', + version: '7.20.1', license: 'BSD-3-Clause', default_options: 'b_ndebug=if-release' ) diff --git a/readme.md b/readme.md index 322d09be..a5dabdd3 100644 --- a/readme.md +++ b/readme.md @@ -1,5 +1,5 @@ -# Xbyak 7.20 [![Badge Build]][Build Status] +# Xbyak 7.20.1 [![Badge Build]][Build Status] *A JIT assembler for x86/x64 architectures supporting advanced instruction sets up to AVX10.2* diff --git a/readme.txt b/readme.txt index 65527f39..82083cd8 100644 --- a/readme.txt +++ b/readme.txt @@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.20 + C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.20.1 ----------------------------------------------------------------------------- ◎概要 @@ -404,6 +404,7 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から ----------------------------------------------------------------------------- ◎履歴 +2024/10/17 ver 7.20.1 AVX10.2 rev 2.0仕様書の変更に追従 2024/10/15 ver 7.20 setDefaultEncoding/setDefaultEncodingAVX10の仕様確定 2024/10/15 ver 7.11 AVX10.2完全サポート 2024/10/13 ver 7.10 AVX10 integer and fp16 vnni, mediaの新命令対応. setDefaultEncodingの拡張. 
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 001f5aeb..5982a5d0 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -155,7 +155,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x7200 /* 0xABCD = A.BC(.D) */ + VERSION = 0x7201 /* 0xABCD = A.BC(.D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index d77daeca..4854e0c1 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "7.20"; } +const char *getVersionString() const { return "7.20.1"; } void aadd(const Address& addr, const Reg32e ®) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); } void aand(const Address& addr, const Reg32e ®) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); } void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); }