From 78173ec6311a22ca9f42cf949cf37754a8b71633 Mon Sep 17 00:00:00 2001
From: Adrian Hunter
Date: Wed, 2 Sep 2015 15:15:27 +0300
Subject: x86/insn: perf tools: Pedantically tweak opcode map for MPX instructions

The MPX instructions are presently not described in the SDM opcode maps,
and there are no encoding characters for bnd registers, address method or
operand type. So the kernel opcode map is using 'Gv' for bnd registers and
'Ev' for everything else. That is fine because the instruction decoder
does not use that information anyway, except as an indication that there
is a ModR/M byte.

Nevertheless, in some cases the 'Gv' and 'Ev' are the wrong way around,
BNDLDX and BNDSTX have 2 operands not 3, and it wouldn't hurt to identify
the mandatory prefixes.

This has no effect on the decoding of valid instructions, but the
addition of the mandatory prefixes will cause some invalid instructions
to error out that wouldn't have previously.

Note that perf tools has a copy of the instruction decoder and provides a
test for new instructions which includes MPX instructions, e.g.:

  $ perf test "x86 ins"
  39: Test x86 instruction decoder - new instructions : Ok

Or to see the details:

  $ perf test -v "x86 ins"

Committer notes:

And to see these MPX instructions specifically:

  $ perf test -v "x86 ins" 2>&1 | grep bndldx | head -3
  Decoded ok: 0f 1a 00 	bndldx (%eax),%bnd0
  Decoded ok: 0f 1a 05 78 56 34 12 	bndldx 0x12345678,%bnd0
  Decoded ok: 0f 1a 18 	bndldx (%eax),%bnd3
  $ perf test -v "x86 ins" 2>&1 | grep bndstx | head -3
  Decoded ok: 0f 1b 00 	bndstx %bnd0,(%eax)
  Decoded ok: 0f 1b 05 78 56 34 12 	bndstx %bnd0,0x12345678
  Decoded ok: 0f 1b 18 	bndstx %bnd3,(%eax)
  $

Signed-off-by: Adrian Hunter
Acked-by: Masami Hiramatsu
Cc: "H. Peter Anvin"
Cc: Andy Lutomirski
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: Jiri Olsa
Cc: Peter Zijlstra
Cc: Qiaowei Ren
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1441196131-20632-4-git-send-email-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 arch/x86/lib/x86-opcode-map.txt | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 816488c0b97e..a02a195d219c 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -353,8 +353,12 @@ AVXcode: 1
 17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1)
 18: Grp16 (1A)
 19:
-1a: BNDCL Ev,Gv | BNDCU Ev,Gv | BNDMOV Gv,Ev | BNDLDX Gv,Ev,Gv
-1b: BNDCN Ev,Gv | BNDMOV Ev,Gv | BNDMK Gv,Ev | BNDSTX Ev,GV,Gv
+# Intel SDM opcode map does not list MPX instructions. For now using Gv for
+# bnd registers and Ev for everything else is OK because the instruction
+# decoder does not use the information except as an indication that there is
+# a ModR/M byte.
+1a: BNDCL Gv,Ev (F3) | BNDCU Gv,Ev (F2) | BNDMOV Gv,Ev (66) | BNDLDX Gv,Ev
+1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv
 1c:
 1d:
 1e:
--
cgit v1.2.3


From 3fe78d6af9e2f08c4014fd3ccbf9e1ff312dedf1 Mon Sep 17 00:00:00 2001
From: Adrian Hunter
Date: Wed, 2 Sep 2015 15:15:28 +0300
Subject: x86/insn: perf tools: Add new SHA instructions

Intel SHA Extensions are explained in the Intel Architecture Instruction
Set Extensions Programming Reference (Oct 2014). There are 7 new
instructions. Add them to the op code map and the perf tools new
instructions test; the row format of those test tables is sketched below.
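Each row in the insn-x86-dat-32.c / insn-x86-dat-64.c tables added by this
patch pairs raw instruction bytes with the expected decode and the objdump
disassembly text. As a rough reading guide, here is a minimal C sketch of
that row shape; the struct and field names are illustrative assumptions
only, not the actual declaration used by the perf test driver
(tools/perf/tests/insn-x86.c):

  /* Illustrative sketch only, not the real declaration. */
  struct insn_test_row {
  	unsigned char bytes[16];  /* raw instruction bytes fed to the decoder */
  	int len;                  /* expected decoded instruction length */
  	int rel;                  /* expected relative offset (branches only) */
  	const char *op;           /* expected op class, e.g. "jcc", else "" */
  	const char *branch;       /* expected branch type, e.g. "conditional", else "" */
  	const char *objdump;      /* hex dump + mnemonic as printed by objdump */
  };

  /* One row from the data below, read in that format:
   * {{0x0f, 0x38, 0xc8, 0xc1, }, 4, 0, "", "",
   * "0f 38 c8 c1 \tsha1nexte %xmm1,%xmm0",},
   */

To run the test, e.g.: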
$ tools/perf/perf test "x86 ins" 39: Test x86 instruction decoder - new instructions : Ok Or to see the details: $ tools/perf/perf test -v "x86 ins" 2>&1 | grep sha Committer note: 3 lines of details, for the curious: $ perf test -v "x86 ins" 2>&1 | grep sha256msg1 | tail -3 Decoded ok: 0f 38 cc 84 08 78 56 34 12 sha256msg1 0x12345678(%rax,%rcx,1),%xmm0 Decoded ok: 0f 38 cc 84 c8 78 56 34 12 sha256msg1 0x12345678(%rax,%rcx,8),%xmm0 Decoded ok: 44 0f 38 cc bc c8 78 56 34 12 sha256msg1 0x12345678(%rax,%rcx,8),%xmm15 $ Signed-off-by: Adrian Hunter Acked-by: Masami Hiramatsu Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Denys Vlasenko Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441196131-20632-5-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/lib/x86-opcode-map.txt | 7 + tools/perf/tests/insn-x86-dat-32.c | 294 ++++++++++++++++ tools/perf/tests/insn-x86-dat-64.c | 364 ++++++++++++++++++++ tools/perf/tests/insn-x86-dat-src.c | 373 +++++++++++++++++++++ .../perf/util/intel-pt-decoder/x86-opcode-map.txt | 7 + 5 files changed, 1045 insertions(+) (limited to 'arch') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index a02a195d219c..25dad388b371 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -736,6 +736,12 @@ bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1) be: vfnmsub231ps/d Vx,Hx,Wx (66),(v) bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1) # 0x0f 0x38 0xc0-0xff +c8: sha1nexte Vdq,Wdq +c9: sha1msg1 Vdq,Wdq +ca: sha1msg2 Vdq,Wdq +cb: sha256rnds2 Vdq,Wdq +cc: sha256msg1 Vdq,Wdq +cd: sha256msg2 Vdq,Wdq db: VAESIMC Vdq,Wdq (66),(v1) dc: VAESENC Vdq,Hdq,Wdq (66),(v1) dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) @@ -794,6 +800,7 @@ AVXcode: 3 61: vpcmpestri Vdq,Wdq,Ib (66),(v1) 62: vpcmpistrm Vdq,Wdq,Ib (66),(v1) 63: vpcmpistri Vdq,Wdq,Ib (66),(v1) +cc: sha1rnds4 Vdq,Wdq,Ib df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) f0: RORX Gy,Ey,Ib (F2),(v) EndTable diff --git a/tools/perf/tests/insn-x86-dat-32.c b/tools/perf/tests/insn-x86-dat-32.c index 6a38a34a5a49..83f5078e74e1 100644 --- a/tools/perf/tests/insn-x86-dat-32.c +++ b/tools/perf/tests/insn-x86-dat-32.c @@ -322,3 +322,297 @@ "f2 ff 21 \tbnd jmp *(%ecx)",}, {{0xf2, 0x0f, 0x85, 0xfc, 0xff, 0xff, 0xff, }, 7, 0xfffffffc, "jcc", "conditional", "f2 0f 85 fc ff ff ff \tbnd jne 3de ",}, +{{0x0f, 0x3a, 0xcc, 0xc1, 0x00, }, 5, 0, "", "", +"0f 3a cc c1 00 \tsha1rnds4 $0x0,%xmm1,%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0xd7, 0x91, }, 5, 0, "", "", +"0f 3a cc d7 91 \tsha1rnds4 $0x91,%xmm7,%xmm2",}, +{{0x0f, 0x3a, 0xcc, 0x00, 0x91, }, 5, 0, "", "", +"0f 3a cc 00 91 \tsha1rnds4 $0x91,(%eax),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x05, 0x78, 0x56, 0x34, 0x12, 0x91, }, 9, 0, "", "", +"0f 3a cc 05 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678,%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x18, 0x91, }, 5, 0, "", "", +"0f 3a cc 18 91 \tsha1rnds4 $0x91,(%eax),%xmm3",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0x01, 0x91, }, 6, 0, "", "", +"0f 3a cc 04 01 91 \tsha1rnds4 $0x91,(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 04 05 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(,%eax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0x08, 0x91, }, 6, 0, "", "", +"0f 3a cc 04 08 91 \tsha1rnds4 $0x91,(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0xc8, 0x91, }, 6, 0, "", "", +"0f 3a cc 04 c8 91 \tsha1rnds4 $0x91,(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x40, 0x12, 0x91, }, 6, 0, "", "", +"0f 3a cc 40 12 91 
\tsha1rnds4 $0x91,0x12(%eax),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x45, 0x12, 0x91, }, 6, 0, "", "", +"0f 3a cc 45 12 91 \tsha1rnds4 $0x91,0x12(%ebp),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0x01, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 01 12 91 \tsha1rnds4 $0x91,0x12(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0x05, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 05 12 91 \tsha1rnds4 $0x91,0x12(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0x08, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 08 12 91 \tsha1rnds4 $0x91,0x12(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0xc8, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 c8 12 91 \tsha1rnds4 $0x91,0x12(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x80, 0x78, 0x56, 0x34, 0x12, 0x91, }, 9, 0, "", "", +"0f 3a cc 80 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%eax),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x85, 0x78, 0x56, 0x34, 0x12, 0x91, }, 9, 0, "", "", +"0f 3a cc 85 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%ebp),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 01 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 05 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 08 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 c8 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0xc1, }, 4, 0, "", "", +"0f 38 c8 c1 \tsha1nexte %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xc8, 0xd7, }, 4, 0, "", "", +"0f 38 c8 d7 \tsha1nexte %xmm7,%xmm2",}, +{{0x0f, 0x38, 0xc8, 0x00, }, 4, 0, "", "", +"0f 38 c8 00 \tsha1nexte (%eax),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x05, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c8 05 78 56 34 12 \tsha1nexte 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x18, }, 4, 0, "", "", +"0f 38 c8 18 \tsha1nexte (%eax),%xmm3",}, +{{0x0f, 0x38, 0xc8, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 c8 04 01 \tsha1nexte (%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 04 05 78 56 34 12 \tsha1nexte 0x12345678(,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 c8 04 08 \tsha1nexte (%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 c8 04 c8 \tsha1nexte (%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 c8 40 12 \tsha1nexte 0x12(%eax),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 c8 45 12 \tsha1nexte 0x12(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 01 12 \tsha1nexte 0x12(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 05 12 \tsha1nexte 0x12(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 08 12 \tsha1nexte 0x12(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 c8 12 \tsha1nexte 0x12(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c8 80 78 56 34 12 \tsha1nexte 0x12345678(%eax),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c8 85 78 56 34 12 \tsha1nexte 0x12345678(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 
0, "", "", +"0f 38 c8 84 01 78 56 34 12 \tsha1nexte 0x12345678(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 05 78 56 34 12 \tsha1nexte 0x12345678(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 08 78 56 34 12 \tsha1nexte 0x12345678(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 c8 78 56 34 12 \tsha1nexte 0x12345678(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0xc1, }, 4, 0, "", "", +"0f 38 c9 c1 \tsha1msg1 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xc9, 0xd7, }, 4, 0, "", "", +"0f 38 c9 d7 \tsha1msg1 %xmm7,%xmm2",}, +{{0x0f, 0x38, 0xc9, 0x00, }, 4, 0, "", "", +"0f 38 c9 00 \tsha1msg1 (%eax),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x05, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c9 05 78 56 34 12 \tsha1msg1 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x18, }, 4, 0, "", "", +"0f 38 c9 18 \tsha1msg1 (%eax),%xmm3",}, +{{0x0f, 0x38, 0xc9, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 c9 04 01 \tsha1msg1 (%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 04 05 78 56 34 12 \tsha1msg1 0x12345678(,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 c9 04 08 \tsha1msg1 (%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 c9 04 c8 \tsha1msg1 (%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 c9 40 12 \tsha1msg1 0x12(%eax),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 c9 45 12 \tsha1msg1 0x12(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 01 12 \tsha1msg1 0x12(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 05 12 \tsha1msg1 0x12(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 08 12 \tsha1msg1 0x12(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 c8 12 \tsha1msg1 0x12(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c9 80 78 56 34 12 \tsha1msg1 0x12345678(%eax),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c9 85 78 56 34 12 \tsha1msg1 0x12345678(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 01 78 56 34 12 \tsha1msg1 0x12345678(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 05 78 56 34 12 \tsha1msg1 0x12345678(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 08 78 56 34 12 \tsha1msg1 0x12345678(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 c8 78 56 34 12 \tsha1msg1 0x12345678(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xca, 0xc1, }, 4, 0, "", "", +"0f 38 ca c1 \tsha1msg2 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xca, 0xd7, }, 4, 0, "", "", +"0f 38 ca d7 \tsha1msg2 %xmm7,%xmm2",}, +{{0x0f, 0x38, 0xca, 0x00, }, 4, 0, "", "", +"0f 38 ca 00 \tsha1msg2 (%eax),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x05, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 ca 05 78 56 34 12 \tsha1msg2 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xca, 0x18, }, 4, 0, "", "", +"0f 38 ca 18 \tsha1msg2 (%eax),%xmm3",}, +{{0x0f, 0x38, 0xca, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 ca 04 01 \tsha1msg2 (%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 
0xca, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 04 05 78 56 34 12 \tsha1msg2 0x12345678(,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 ca 04 08 \tsha1msg2 (%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 ca 04 c8 \tsha1msg2 (%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 ca 40 12 \tsha1msg2 0x12(%eax),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 ca 45 12 \tsha1msg2 0x12(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 01 12 \tsha1msg2 0x12(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 05 12 \tsha1msg2 0x12(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 08 12 \tsha1msg2 0x12(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 c8 12 \tsha1msg2 0x12(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 ca 80 78 56 34 12 \tsha1msg2 0x12345678(%eax),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 ca 85 78 56 34 12 \tsha1msg2 0x12345678(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 01 78 56 34 12 \tsha1msg2 0x12345678(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 05 78 56 34 12 \tsha1msg2 0x12345678(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 08 78 56 34 12 \tsha1msg2 0x12345678(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 c8 78 56 34 12 \tsha1msg2 0x12345678(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xcb, 0xcc, }, 4, 0, "", "", +"0f 38 cb cc \tsha256rnds2 %xmm0,%xmm4,%xmm1",}, +{{0x0f, 0x38, 0xcb, 0xd7, }, 4, 0, "", "", +"0f 38 cb d7 \tsha256rnds2 %xmm0,%xmm7,%xmm2",}, +{{0x0f, 0x38, 0xcb, 0x08, }, 4, 0, "", "", +"0f 38 cb 08 \tsha256rnds2 %xmm0,(%eax),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0d, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cb 0d 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678,%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x18, }, 4, 0, "", "", +"0f 38 cb 18 \tsha256rnds2 %xmm0,(%eax),%xmm3",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0x01, }, 5, 0, "", "", +"0f 38 cb 0c 01 \tsha256rnds2 %xmm0,(%ecx,%eax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 0c 05 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(,%eax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0x08, }, 5, 0, "", "", +"0f 38 cb 0c 08 \tsha256rnds2 %xmm0,(%eax,%ecx,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0xc8, }, 5, 0, "", "", +"0f 38 cb 0c c8 \tsha256rnds2 %xmm0,(%eax,%ecx,8),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x48, 0x12, }, 5, 0, "", "", +"0f 38 cb 48 12 \tsha256rnds2 %xmm0,0x12(%eax),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4d, 0x12, }, 5, 0, "", "", +"0f 38 cb 4d 12 \tsha256rnds2 %xmm0,0x12(%ebp),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c 01 12 \tsha256rnds2 %xmm0,0x12(%ecx,%eax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c 05 12 \tsha256rnds2 %xmm0,0x12(%ebp,%eax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c 08 12 \tsha256rnds2 %xmm0,0x12(%eax,%ecx,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c c8 12 \tsha256rnds2 
%xmm0,0x12(%eax,%ecx,8),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x88, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cb 88 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%eax),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8d, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cb 8d 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%ebp),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c 01 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%ecx,%eax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c 05 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%ebp,%eax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c 08 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%eax,%ecx,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c c8 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%eax,%ecx,8),%xmm1",}, +{{0x0f, 0x38, 0xcc, 0xc1, }, 4, 0, "", "", +"0f 38 cc c1 \tsha256msg1 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xcc, 0xd7, }, 4, 0, "", "", +"0f 38 cc d7 \tsha256msg1 %xmm7,%xmm2",}, +{{0x0f, 0x38, 0xcc, 0x00, }, 4, 0, "", "", +"0f 38 cc 00 \tsha256msg1 (%eax),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x05, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cc 05 78 56 34 12 \tsha256msg1 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x18, }, 4, 0, "", "", +"0f 38 cc 18 \tsha256msg1 (%eax),%xmm3",}, +{{0x0f, 0x38, 0xcc, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 cc 04 01 \tsha256msg1 (%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 04 05 78 56 34 12 \tsha256msg1 0x12345678(,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 cc 04 08 \tsha256msg1 (%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 cc 04 c8 \tsha256msg1 (%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 cc 40 12 \tsha256msg1 0x12(%eax),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 cc 45 12 \tsha256msg1 0x12(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 01 12 \tsha256msg1 0x12(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 05 12 \tsha256msg1 0x12(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 08 12 \tsha256msg1 0x12(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 c8 12 \tsha256msg1 0x12(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cc 80 78 56 34 12 \tsha256msg1 0x12345678(%eax),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cc 85 78 56 34 12 \tsha256msg1 0x12345678(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 01 78 56 34 12 \tsha256msg1 0x12345678(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 05 78 56 34 12 \tsha256msg1 0x12345678(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 08 78 56 34 12 \tsha256msg1 0x12345678(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 c8 78 56 34 12 \tsha256msg1 0x12345678(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0xc1, }, 4, 0, "", "", +"0f 38 cd c1 \tsha256msg2 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xcd, 0xd7, }, 4, 0, "", "", +"0f 
38 cd d7 \tsha256msg2 %xmm7,%xmm2",}, +{{0x0f, 0x38, 0xcd, 0x00, }, 4, 0, "", "", +"0f 38 cd 00 \tsha256msg2 (%eax),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x05, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cd 05 78 56 34 12 \tsha256msg2 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x18, }, 4, 0, "", "", +"0f 38 cd 18 \tsha256msg2 (%eax),%xmm3",}, +{{0x0f, 0x38, 0xcd, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 cd 04 01 \tsha256msg2 (%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 04 05 78 56 34 12 \tsha256msg2 0x12345678(,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 cd 04 08 \tsha256msg2 (%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 cd 04 c8 \tsha256msg2 (%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 cd 40 12 \tsha256msg2 0x12(%eax),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 cd 45 12 \tsha256msg2 0x12(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 01 12 \tsha256msg2 0x12(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 05 12 \tsha256msg2 0x12(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 08 12 \tsha256msg2 0x12(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 c8 12 \tsha256msg2 0x12(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cd 80 78 56 34 12 \tsha256msg2 0x12345678(%eax),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cd 85 78 56 34 12 \tsha256msg2 0x12345678(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 01 78 56 34 12 \tsha256msg2 0x12345678(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 05 78 56 34 12 \tsha256msg2 0x12345678(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 08 78 56 34 12 \tsha256msg2 0x12345678(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 c8 78 56 34 12 \tsha256msg2 0x12345678(%eax,%ecx,8),%xmm0",}, diff --git a/tools/perf/tests/insn-x86-dat-64.c b/tools/perf/tests/insn-x86-dat-64.c index 01122421a776..13f008588590 100644 --- a/tools/perf/tests/insn-x86-dat-64.c +++ b/tools/perf/tests/insn-x86-dat-64.c @@ -338,3 +338,367 @@ "67 f2 ff 21 \tbnd jmpq *(%ecx)",}, {{0xf2, 0x0f, 0x85, 0x00, 0x00, 0x00, 0x00, }, 7, 0, "jcc", "conditional", "f2 0f 85 00 00 00 00 \tbnd jne 413 ",}, +{{0x0f, 0x3a, 0xcc, 0xc1, 0x00, }, 5, 0, "", "", +"0f 3a cc c1 00 \tsha1rnds4 $0x0,%xmm1,%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0xd7, 0x91, }, 5, 0, "", "", +"0f 3a cc d7 91 \tsha1rnds4 $0x91,%xmm7,%xmm2",}, +{{0x41, 0x0f, 0x3a, 0xcc, 0xc0, 0x91, }, 6, 0, "", "", +"41 0f 3a cc c0 91 \tsha1rnds4 $0x91,%xmm8,%xmm0",}, +{{0x44, 0x0f, 0x3a, 0xcc, 0xc7, 0x91, }, 6, 0, "", "", +"44 0f 3a cc c7 91 \tsha1rnds4 $0x91,%xmm7,%xmm8",}, +{{0x45, 0x0f, 0x3a, 0xcc, 0xc7, 0x91, }, 6, 0, "", "", +"45 0f 3a cc c7 91 \tsha1rnds4 $0x91,%xmm15,%xmm8",}, +{{0x0f, 0x3a, 0xcc, 0x00, 0x91, }, 5, 0, "", "", +"0f 3a cc 00 91 \tsha1rnds4 $0x91,(%rax),%xmm0",}, +{{0x41, 0x0f, 0x3a, 0xcc, 0x00, 0x91, }, 6, 0, "", "", +"41 0f 3a cc 00 91 \tsha1rnds4 $0x91,(%r8),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0x25, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 
04 25 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678,%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x18, 0x91, }, 5, 0, "", "", +"0f 3a cc 18 91 \tsha1rnds4 $0x91,(%rax),%xmm3",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0x01, 0x91, }, 6, 0, "", "", +"0f 3a cc 04 01 91 \tsha1rnds4 $0x91,(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 04 05 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(,%rax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0x08, 0x91, }, 6, 0, "", "", +"0f 3a cc 04 08 91 \tsha1rnds4 $0x91,(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0xc8, 0x91, }, 6, 0, "", "", +"0f 3a cc 04 c8 91 \tsha1rnds4 $0x91,(%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x40, 0x12, 0x91, }, 6, 0, "", "", +"0f 3a cc 40 12 91 \tsha1rnds4 $0x91,0x12(%rax),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x45, 0x12, 0x91, }, 6, 0, "", "", +"0f 3a cc 45 12 91 \tsha1rnds4 $0x91,0x12(%rbp),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0x01, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 01 12 91 \tsha1rnds4 $0x91,0x12(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0x05, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 05 12 91 \tsha1rnds4 $0x91,0x12(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0x08, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 08 12 91 \tsha1rnds4 $0x91,0x12(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0xc8, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 c8 12 91 \tsha1rnds4 $0x91,0x12(%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x80, 0x78, 0x56, 0x34, 0x12, 0x91, }, 9, 0, "", "", +"0f 3a cc 80 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%rax),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x85, 0x78, 0x56, 0x34, 0x12, 0x91, }, 9, 0, "", "", +"0f 3a cc 85 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%rbp),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 01 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 05 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 08 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 c8 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%rax,%rcx,8),%xmm0",}, +{{0x44, 0x0f, 0x3a, 0xcc, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, 0x91, }, 11, 0, "", "", +"44 0f 3a cc bc c8 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%rax,%rcx,8),%xmm15",}, +{{0x0f, 0x38, 0xc8, 0xc1, }, 4, 0, "", "", +"0f 38 c8 c1 \tsha1nexte %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xc8, 0xd7, }, 4, 0, "", "", +"0f 38 c8 d7 \tsha1nexte %xmm7,%xmm2",}, +{{0x41, 0x0f, 0x38, 0xc8, 0xc0, }, 5, 0, "", "", +"41 0f 38 c8 c0 \tsha1nexte %xmm8,%xmm0",}, +{{0x44, 0x0f, 0x38, 0xc8, 0xc7, }, 5, 0, "", "", +"44 0f 38 c8 c7 \tsha1nexte %xmm7,%xmm8",}, +{{0x45, 0x0f, 0x38, 0xc8, 0xc7, }, 5, 0, "", "", +"45 0f 38 c8 c7 \tsha1nexte %xmm15,%xmm8",}, +{{0x0f, 0x38, 0xc8, 0x00, }, 4, 0, "", "", +"0f 38 c8 00 \tsha1nexte (%rax),%xmm0",}, +{{0x41, 0x0f, 0x38, 0xc8, 0x00, }, 5, 0, "", "", +"41 0f 38 c8 00 \tsha1nexte (%r8),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x04, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 04 25 78 56 34 12 \tsha1nexte 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x18, }, 4, 0, "", "", +"0f 38 c8 18 \tsha1nexte (%rax),%xmm3",}, +{{0x0f, 0x38, 0xc8, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 c8 04 01 \tsha1nexte (%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x04, 
0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 04 05 78 56 34 12 \tsha1nexte 0x12345678(,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 c8 04 08 \tsha1nexte (%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 c8 04 c8 \tsha1nexte (%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 c8 40 12 \tsha1nexte 0x12(%rax),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 c8 45 12 \tsha1nexte 0x12(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 01 12 \tsha1nexte 0x12(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 05 12 \tsha1nexte 0x12(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 08 12 \tsha1nexte 0x12(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 c8 12 \tsha1nexte 0x12(%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c8 80 78 56 34 12 \tsha1nexte 0x12345678(%rax),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c8 85 78 56 34 12 \tsha1nexte 0x12345678(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 01 78 56 34 12 \tsha1nexte 0x12345678(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 05 78 56 34 12 \tsha1nexte 0x12345678(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 08 78 56 34 12 \tsha1nexte 0x12345678(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 c8 78 56 34 12 \tsha1nexte 0x12345678(%rax,%rcx,8),%xmm0",}, +{{0x44, 0x0f, 0x38, 0xc8, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"44 0f 38 c8 bc c8 78 56 34 12 \tsha1nexte 0x12345678(%rax,%rcx,8),%xmm15",}, +{{0x0f, 0x38, 0xc9, 0xc1, }, 4, 0, "", "", +"0f 38 c9 c1 \tsha1msg1 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xc9, 0xd7, }, 4, 0, "", "", +"0f 38 c9 d7 \tsha1msg1 %xmm7,%xmm2",}, +{{0x41, 0x0f, 0x38, 0xc9, 0xc0, }, 5, 0, "", "", +"41 0f 38 c9 c0 \tsha1msg1 %xmm8,%xmm0",}, +{{0x44, 0x0f, 0x38, 0xc9, 0xc7, }, 5, 0, "", "", +"44 0f 38 c9 c7 \tsha1msg1 %xmm7,%xmm8",}, +{{0x45, 0x0f, 0x38, 0xc9, 0xc7, }, 5, 0, "", "", +"45 0f 38 c9 c7 \tsha1msg1 %xmm15,%xmm8",}, +{{0x0f, 0x38, 0xc9, 0x00, }, 4, 0, "", "", +"0f 38 c9 00 \tsha1msg1 (%rax),%xmm0",}, +{{0x41, 0x0f, 0x38, 0xc9, 0x00, }, 5, 0, "", "", +"41 0f 38 c9 00 \tsha1msg1 (%r8),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x04, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 04 25 78 56 34 12 \tsha1msg1 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x18, }, 4, 0, "", "", +"0f 38 c9 18 \tsha1msg1 (%rax),%xmm3",}, +{{0x0f, 0x38, 0xc9, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 c9 04 01 \tsha1msg1 (%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 04 05 78 56 34 12 \tsha1msg1 0x12345678(,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 c9 04 08 \tsha1msg1 (%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 c9 04 c8 \tsha1msg1 (%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 c9 40 12 \tsha1msg1 0x12(%rax),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 c9 45 12 \tsha1msg1 0x12(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0x01, 0x12, }, 
6, 0, "", "", +"0f 38 c9 44 01 12 \tsha1msg1 0x12(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 05 12 \tsha1msg1 0x12(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 08 12 \tsha1msg1 0x12(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 c8 12 \tsha1msg1 0x12(%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c9 80 78 56 34 12 \tsha1msg1 0x12345678(%rax),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c9 85 78 56 34 12 \tsha1msg1 0x12345678(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 01 78 56 34 12 \tsha1msg1 0x12345678(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 05 78 56 34 12 \tsha1msg1 0x12345678(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 08 78 56 34 12 \tsha1msg1 0x12345678(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 c8 78 56 34 12 \tsha1msg1 0x12345678(%rax,%rcx,8),%xmm0",}, +{{0x44, 0x0f, 0x38, 0xc9, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"44 0f 38 c9 bc c8 78 56 34 12 \tsha1msg1 0x12345678(%rax,%rcx,8),%xmm15",}, +{{0x0f, 0x38, 0xca, 0xc1, }, 4, 0, "", "", +"0f 38 ca c1 \tsha1msg2 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xca, 0xd7, }, 4, 0, "", "", +"0f 38 ca d7 \tsha1msg2 %xmm7,%xmm2",}, +{{0x41, 0x0f, 0x38, 0xca, 0xc0, }, 5, 0, "", "", +"41 0f 38 ca c0 \tsha1msg2 %xmm8,%xmm0",}, +{{0x44, 0x0f, 0x38, 0xca, 0xc7, }, 5, 0, "", "", +"44 0f 38 ca c7 \tsha1msg2 %xmm7,%xmm8",}, +{{0x45, 0x0f, 0x38, 0xca, 0xc7, }, 5, 0, "", "", +"45 0f 38 ca c7 \tsha1msg2 %xmm15,%xmm8",}, +{{0x0f, 0x38, 0xca, 0x00, }, 4, 0, "", "", +"0f 38 ca 00 \tsha1msg2 (%rax),%xmm0",}, +{{0x41, 0x0f, 0x38, 0xca, 0x00, }, 5, 0, "", "", +"41 0f 38 ca 00 \tsha1msg2 (%r8),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x04, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 04 25 78 56 34 12 \tsha1msg2 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xca, 0x18, }, 4, 0, "", "", +"0f 38 ca 18 \tsha1msg2 (%rax),%xmm3",}, +{{0x0f, 0x38, 0xca, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 ca 04 01 \tsha1msg2 (%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 04 05 78 56 34 12 \tsha1msg2 0x12345678(,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 ca 04 08 \tsha1msg2 (%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 ca 04 c8 \tsha1msg2 (%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 ca 40 12 \tsha1msg2 0x12(%rax),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 ca 45 12 \tsha1msg2 0x12(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 01 12 \tsha1msg2 0x12(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 05 12 \tsha1msg2 0x12(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 08 12 \tsha1msg2 0x12(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 c8 12 \tsha1msg2 0x12(%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 ca 80 78 56 34 12 \tsha1msg2 0x12345678(%rax),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x85, 
0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 ca 85 78 56 34 12 \tsha1msg2 0x12345678(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 01 78 56 34 12 \tsha1msg2 0x12345678(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 05 78 56 34 12 \tsha1msg2 0x12345678(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 08 78 56 34 12 \tsha1msg2 0x12345678(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 c8 78 56 34 12 \tsha1msg2 0x12345678(%rax,%rcx,8),%xmm0",}, +{{0x44, 0x0f, 0x38, 0xca, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"44 0f 38 ca bc c8 78 56 34 12 \tsha1msg2 0x12345678(%rax,%rcx,8),%xmm15",}, +{{0x0f, 0x38, 0xcb, 0xcc, }, 4, 0, "", "", +"0f 38 cb cc \tsha256rnds2 %xmm0,%xmm4,%xmm1",}, +{{0x0f, 0x38, 0xcb, 0xd7, }, 4, 0, "", "", +"0f 38 cb d7 \tsha256rnds2 %xmm0,%xmm7,%xmm2",}, +{{0x41, 0x0f, 0x38, 0xcb, 0xc8, }, 5, 0, "", "", +"41 0f 38 cb c8 \tsha256rnds2 %xmm0,%xmm8,%xmm1",}, +{{0x44, 0x0f, 0x38, 0xcb, 0xc7, }, 5, 0, "", "", +"44 0f 38 cb c7 \tsha256rnds2 %xmm0,%xmm7,%xmm8",}, +{{0x45, 0x0f, 0x38, 0xcb, 0xc7, }, 5, 0, "", "", +"45 0f 38 cb c7 \tsha256rnds2 %xmm0,%xmm15,%xmm8",}, +{{0x0f, 0x38, 0xcb, 0x08, }, 4, 0, "", "", +"0f 38 cb 08 \tsha256rnds2 %xmm0,(%rax),%xmm1",}, +{{0x41, 0x0f, 0x38, 0xcb, 0x08, }, 5, 0, "", "", +"41 0f 38 cb 08 \tsha256rnds2 %xmm0,(%r8),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 0c 25 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678,%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x18, }, 4, 0, "", "", +"0f 38 cb 18 \tsha256rnds2 %xmm0,(%rax),%xmm3",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0x01, }, 5, 0, "", "", +"0f 38 cb 0c 01 \tsha256rnds2 %xmm0,(%rcx,%rax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 0c 05 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(,%rax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0x08, }, 5, 0, "", "", +"0f 38 cb 0c 08 \tsha256rnds2 %xmm0,(%rax,%rcx,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0xc8, }, 5, 0, "", "", +"0f 38 cb 0c c8 \tsha256rnds2 %xmm0,(%rax,%rcx,8),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x48, 0x12, }, 5, 0, "", "", +"0f 38 cb 48 12 \tsha256rnds2 %xmm0,0x12(%rax),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4d, 0x12, }, 5, 0, "", "", +"0f 38 cb 4d 12 \tsha256rnds2 %xmm0,0x12(%rbp),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c 01 12 \tsha256rnds2 %xmm0,0x12(%rcx,%rax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c 05 12 \tsha256rnds2 %xmm0,0x12(%rbp,%rax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c 08 12 \tsha256rnds2 %xmm0,0x12(%rax,%rcx,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c c8 12 \tsha256rnds2 %xmm0,0x12(%rax,%rcx,8),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x88, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cb 88 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%rax),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8d, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cb 8d 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%rbp),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c 01 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%rcx,%rax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c 05 78 56 34 12 \tsha256rnds2 
%xmm0,0x12345678(%rbp,%rax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c 08 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%rax,%rcx,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c c8 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%rax,%rcx,8),%xmm1",}, +{{0x44, 0x0f, 0x38, 0xcb, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"44 0f 38 cb bc c8 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%rax,%rcx,8),%xmm15",}, +{{0x0f, 0x38, 0xcc, 0xc1, }, 4, 0, "", "", +"0f 38 cc c1 \tsha256msg1 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xcc, 0xd7, }, 4, 0, "", "", +"0f 38 cc d7 \tsha256msg1 %xmm7,%xmm2",}, +{{0x41, 0x0f, 0x38, 0xcc, 0xc0, }, 5, 0, "", "", +"41 0f 38 cc c0 \tsha256msg1 %xmm8,%xmm0",}, +{{0x44, 0x0f, 0x38, 0xcc, 0xc7, }, 5, 0, "", "", +"44 0f 38 cc c7 \tsha256msg1 %xmm7,%xmm8",}, +{{0x45, 0x0f, 0x38, 0xcc, 0xc7, }, 5, 0, "", "", +"45 0f 38 cc c7 \tsha256msg1 %xmm15,%xmm8",}, +{{0x0f, 0x38, 0xcc, 0x00, }, 4, 0, "", "", +"0f 38 cc 00 \tsha256msg1 (%rax),%xmm0",}, +{{0x41, 0x0f, 0x38, 0xcc, 0x00, }, 5, 0, "", "", +"41 0f 38 cc 00 \tsha256msg1 (%r8),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x04, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 04 25 78 56 34 12 \tsha256msg1 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x18, }, 4, 0, "", "", +"0f 38 cc 18 \tsha256msg1 (%rax),%xmm3",}, +{{0x0f, 0x38, 0xcc, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 cc 04 01 \tsha256msg1 (%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 04 05 78 56 34 12 \tsha256msg1 0x12345678(,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 cc 04 08 \tsha256msg1 (%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 cc 04 c8 \tsha256msg1 (%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 cc 40 12 \tsha256msg1 0x12(%rax),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 cc 45 12 \tsha256msg1 0x12(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 01 12 \tsha256msg1 0x12(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 05 12 \tsha256msg1 0x12(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 08 12 \tsha256msg1 0x12(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 c8 12 \tsha256msg1 0x12(%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cc 80 78 56 34 12 \tsha256msg1 0x12345678(%rax),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cc 85 78 56 34 12 \tsha256msg1 0x12345678(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 01 78 56 34 12 \tsha256msg1 0x12345678(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 05 78 56 34 12 \tsha256msg1 0x12345678(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 08 78 56 34 12 \tsha256msg1 0x12345678(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 c8 78 56 34 12 \tsha256msg1 0x12345678(%rax,%rcx,8),%xmm0",}, +{{0x44, 0x0f, 0x38, 0xcc, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"44 0f 38 cc bc c8 78 56 34 12 \tsha256msg1 0x12345678(%rax,%rcx,8),%xmm15",}, 
+{{0x0f, 0x38, 0xcd, 0xc1, }, 4, 0, "", "", +"0f 38 cd c1 \tsha256msg2 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xcd, 0xd7, }, 4, 0, "", "", +"0f 38 cd d7 \tsha256msg2 %xmm7,%xmm2",}, +{{0x41, 0x0f, 0x38, 0xcd, 0xc0, }, 5, 0, "", "", +"41 0f 38 cd c0 \tsha256msg2 %xmm8,%xmm0",}, +{{0x44, 0x0f, 0x38, 0xcd, 0xc7, }, 5, 0, "", "", +"44 0f 38 cd c7 \tsha256msg2 %xmm7,%xmm8",}, +{{0x45, 0x0f, 0x38, 0xcd, 0xc7, }, 5, 0, "", "", +"45 0f 38 cd c7 \tsha256msg2 %xmm15,%xmm8",}, +{{0x0f, 0x38, 0xcd, 0x00, }, 4, 0, "", "", +"0f 38 cd 00 \tsha256msg2 (%rax),%xmm0",}, +{{0x41, 0x0f, 0x38, 0xcd, 0x00, }, 5, 0, "", "", +"41 0f 38 cd 00 \tsha256msg2 (%r8),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x04, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 04 25 78 56 34 12 \tsha256msg2 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x18, }, 4, 0, "", "", +"0f 38 cd 18 \tsha256msg2 (%rax),%xmm3",}, +{{0x0f, 0x38, 0xcd, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 cd 04 01 \tsha256msg2 (%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 04 05 78 56 34 12 \tsha256msg2 0x12345678(,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 cd 04 08 \tsha256msg2 (%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 cd 04 c8 \tsha256msg2 (%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 cd 40 12 \tsha256msg2 0x12(%rax),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 cd 45 12 \tsha256msg2 0x12(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 01 12 \tsha256msg2 0x12(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 05 12 \tsha256msg2 0x12(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 08 12 \tsha256msg2 0x12(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 c8 12 \tsha256msg2 0x12(%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cd 80 78 56 34 12 \tsha256msg2 0x12345678(%rax),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cd 85 78 56 34 12 \tsha256msg2 0x12345678(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 01 78 56 34 12 \tsha256msg2 0x12345678(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 05 78 56 34 12 \tsha256msg2 0x12345678(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 08 78 56 34 12 \tsha256msg2 0x12345678(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 c8 78 56 34 12 \tsha256msg2 0x12345678(%rax,%rcx,8),%xmm0",}, +{{0x44, 0x0f, 0x38, 0xcd, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"44 0f 38 cd bc c8 78 56 34 12 \tsha256msg2 0x12345678(%rax,%rcx,8),%xmm15",}, diff --git a/tools/perf/tests/insn-x86-dat-src.c b/tools/perf/tests/insn-x86-dat-src.c index b506830f33a8..7d06c9b22070 100644 --- a/tools/perf/tests/insn-x86-dat-src.c +++ b/tools/perf/tests/insn-x86-dat-src.c @@ -217,6 +217,210 @@ int main(void) asm volatile("bnd jmp *(%ecx)"); /* Expecting: jmp indirect 0 */ asm volatile("bnd jne label1"); /* Expecting: jcc conditional 0 */ + /* sha1rnds4 imm8, xmm2/m128, xmm1 */ + + asm volatile("sha1rnds4 $0x0, %xmm1, %xmm0"); + asm volatile("sha1rnds4 $0x91, %xmm7, %xmm2"); + asm 
volatile("sha1rnds4 $0x91, %xmm8, %xmm0"); + asm volatile("sha1rnds4 $0x91, %xmm7, %xmm8"); + asm volatile("sha1rnds4 $0x91, %xmm15, %xmm8"); + asm volatile("sha1rnds4 $0x91, (%rax), %xmm0"); + asm volatile("sha1rnds4 $0x91, (%r8), %xmm0"); + asm volatile("sha1rnds4 $0x91, (0x12345678), %xmm0"); + asm volatile("sha1rnds4 $0x91, (%rax), %xmm3"); + asm volatile("sha1rnds4 $0x91, (%rcx,%rax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(,%rax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, (%rax,%rcx,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, (%rax,%rcx,8), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%rax), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%rbp), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%rcx,%rax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%rbp,%rax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%rax,%rcx,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%rax,%rcx,8), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%rax), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%rbp), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%rcx,%rax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%rbp,%rax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%rax,%rcx,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%rax,%rcx,8), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%rax,%rcx,8), %xmm15"); + + /* sha1nexte xmm2/m128, xmm1 */ + + asm volatile("sha1nexte %xmm1, %xmm0"); + asm volatile("sha1nexte %xmm7, %xmm2"); + asm volatile("sha1nexte %xmm8, %xmm0"); + asm volatile("sha1nexte %xmm7, %xmm8"); + asm volatile("sha1nexte %xmm15, %xmm8"); + asm volatile("sha1nexte (%rax), %xmm0"); + asm volatile("sha1nexte (%r8), %xmm0"); + asm volatile("sha1nexte (0x12345678), %xmm0"); + asm volatile("sha1nexte (%rax), %xmm3"); + asm volatile("sha1nexte (%rcx,%rax,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(,%rax,1), %xmm0"); + asm volatile("sha1nexte (%rax,%rcx,1), %xmm0"); + asm volatile("sha1nexte (%rax,%rcx,8), %xmm0"); + asm volatile("sha1nexte 0x12(%rax), %xmm0"); + asm volatile("sha1nexte 0x12(%rbp), %xmm0"); + asm volatile("sha1nexte 0x12(%rcx,%rax,1), %xmm0"); + asm volatile("sha1nexte 0x12(%rbp,%rax,1), %xmm0"); + asm volatile("sha1nexte 0x12(%rax,%rcx,1), %xmm0"); + asm volatile("sha1nexte 0x12(%rax,%rcx,8), %xmm0"); + asm volatile("sha1nexte 0x12345678(%rax), %xmm0"); + asm volatile("sha1nexte 0x12345678(%rbp), %xmm0"); + asm volatile("sha1nexte 0x12345678(%rcx,%rax,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(%rbp,%rax,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(%rax,%rcx,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(%rax,%rcx,8), %xmm0"); + asm volatile("sha1nexte 0x12345678(%rax,%rcx,8), %xmm15"); + + /* sha1msg1 xmm2/m128, xmm1 */ + + asm volatile("sha1msg1 %xmm1, %xmm0"); + asm volatile("sha1msg1 %xmm7, %xmm2"); + asm volatile("sha1msg1 %xmm8, %xmm0"); + asm volatile("sha1msg1 %xmm7, %xmm8"); + asm volatile("sha1msg1 %xmm15, %xmm8"); + asm volatile("sha1msg1 (%rax), %xmm0"); + asm volatile("sha1msg1 (%r8), %xmm0"); + asm volatile("sha1msg1 (0x12345678), %xmm0"); + asm volatile("sha1msg1 (%rax), %xmm3"); + asm volatile("sha1msg1 (%rcx,%rax,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(,%rax,1), %xmm0"); + asm volatile("sha1msg1 (%rax,%rcx,1), %xmm0"); + asm volatile("sha1msg1 (%rax,%rcx,8), %xmm0"); + asm volatile("sha1msg1 0x12(%rax), %xmm0"); + asm volatile("sha1msg1 0x12(%rbp), %xmm0"); + asm volatile("sha1msg1 0x12(%rcx,%rax,1), %xmm0"); + asm volatile("sha1msg1 0x12(%rbp,%rax,1), %xmm0"); 
+ asm volatile("sha1msg1 0x12(%rax,%rcx,1), %xmm0"); + asm volatile("sha1msg1 0x12(%rax,%rcx,8), %xmm0"); + asm volatile("sha1msg1 0x12345678(%rax), %xmm0"); + asm volatile("sha1msg1 0x12345678(%rbp), %xmm0"); + asm volatile("sha1msg1 0x12345678(%rcx,%rax,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(%rbp,%rax,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(%rax,%rcx,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(%rax,%rcx,8), %xmm0"); + asm volatile("sha1msg1 0x12345678(%rax,%rcx,8), %xmm15"); + + /* sha1msg2 xmm2/m128, xmm1 */ + + asm volatile("sha1msg2 %xmm1, %xmm0"); + asm volatile("sha1msg2 %xmm7, %xmm2"); + asm volatile("sha1msg2 %xmm8, %xmm0"); + asm volatile("sha1msg2 %xmm7, %xmm8"); + asm volatile("sha1msg2 %xmm15, %xmm8"); + asm volatile("sha1msg2 (%rax), %xmm0"); + asm volatile("sha1msg2 (%r8), %xmm0"); + asm volatile("sha1msg2 (0x12345678), %xmm0"); + asm volatile("sha1msg2 (%rax), %xmm3"); + asm volatile("sha1msg2 (%rcx,%rax,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(,%rax,1), %xmm0"); + asm volatile("sha1msg2 (%rax,%rcx,1), %xmm0"); + asm volatile("sha1msg2 (%rax,%rcx,8), %xmm0"); + asm volatile("sha1msg2 0x12(%rax), %xmm0"); + asm volatile("sha1msg2 0x12(%rbp), %xmm0"); + asm volatile("sha1msg2 0x12(%rcx,%rax,1), %xmm0"); + asm volatile("sha1msg2 0x12(%rbp,%rax,1), %xmm0"); + asm volatile("sha1msg2 0x12(%rax,%rcx,1), %xmm0"); + asm volatile("sha1msg2 0x12(%rax,%rcx,8), %xmm0"); + asm volatile("sha1msg2 0x12345678(%rax), %xmm0"); + asm volatile("sha1msg2 0x12345678(%rbp), %xmm0"); + asm volatile("sha1msg2 0x12345678(%rcx,%rax,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(%rbp,%rax,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(%rax,%rcx,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(%rax,%rcx,8), %xmm0"); + asm volatile("sha1msg2 0x12345678(%rax,%rcx,8), %xmm15"); + + /* sha256rnds2 , xmm2/m128, xmm1 */ + /* Note sha256rnds2 has an implicit operand 'xmm0' */ + + asm volatile("sha256rnds2 %xmm4, %xmm1"); + asm volatile("sha256rnds2 %xmm7, %xmm2"); + asm volatile("sha256rnds2 %xmm8, %xmm1"); + asm volatile("sha256rnds2 %xmm7, %xmm8"); + asm volatile("sha256rnds2 %xmm15, %xmm8"); + asm volatile("sha256rnds2 (%rax), %xmm1"); + asm volatile("sha256rnds2 (%r8), %xmm1"); + asm volatile("sha256rnds2 (0x12345678), %xmm1"); + asm volatile("sha256rnds2 (%rax), %xmm3"); + asm volatile("sha256rnds2 (%rcx,%rax,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(,%rax,1), %xmm1"); + asm volatile("sha256rnds2 (%rax,%rcx,1), %xmm1"); + asm volatile("sha256rnds2 (%rax,%rcx,8), %xmm1"); + asm volatile("sha256rnds2 0x12(%rax), %xmm1"); + asm volatile("sha256rnds2 0x12(%rbp), %xmm1"); + asm volatile("sha256rnds2 0x12(%rcx,%rax,1), %xmm1"); + asm volatile("sha256rnds2 0x12(%rbp,%rax,1), %xmm1"); + asm volatile("sha256rnds2 0x12(%rax,%rcx,1), %xmm1"); + asm volatile("sha256rnds2 0x12(%rax,%rcx,8), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%rax), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%rbp), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%rcx,%rax,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%rbp,%rax,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%rax,%rcx,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%rax,%rcx,8), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%rax,%rcx,8), %xmm15"); + + /* sha256msg1 xmm2/m128, xmm1 */ + + asm volatile("sha256msg1 %xmm1, %xmm0"); + asm volatile("sha256msg1 %xmm7, %xmm2"); + asm volatile("sha256msg1 %xmm8, %xmm0"); + asm volatile("sha256msg1 %xmm7, %xmm8"); + asm volatile("sha256msg1 %xmm15, %xmm8"); + asm 
volatile("sha256msg1 (%rax), %xmm0"); + asm volatile("sha256msg1 (%r8), %xmm0"); + asm volatile("sha256msg1 (0x12345678), %xmm0"); + asm volatile("sha256msg1 (%rax), %xmm3"); + asm volatile("sha256msg1 (%rcx,%rax,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(,%rax,1), %xmm0"); + asm volatile("sha256msg1 (%rax,%rcx,1), %xmm0"); + asm volatile("sha256msg1 (%rax,%rcx,8), %xmm0"); + asm volatile("sha256msg1 0x12(%rax), %xmm0"); + asm volatile("sha256msg1 0x12(%rbp), %xmm0"); + asm volatile("sha256msg1 0x12(%rcx,%rax,1), %xmm0"); + asm volatile("sha256msg1 0x12(%rbp,%rax,1), %xmm0"); + asm volatile("sha256msg1 0x12(%rax,%rcx,1), %xmm0"); + asm volatile("sha256msg1 0x12(%rax,%rcx,8), %xmm0"); + asm volatile("sha256msg1 0x12345678(%rax), %xmm0"); + asm volatile("sha256msg1 0x12345678(%rbp), %xmm0"); + asm volatile("sha256msg1 0x12345678(%rcx,%rax,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(%rbp,%rax,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(%rax,%rcx,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(%rax,%rcx,8), %xmm0"); + asm volatile("sha256msg1 0x12345678(%rax,%rcx,8), %xmm15"); + + /* sha256msg2 xmm2/m128, xmm1 */ + + asm volatile("sha256msg2 %xmm1, %xmm0"); + asm volatile("sha256msg2 %xmm7, %xmm2"); + asm volatile("sha256msg2 %xmm8, %xmm0"); + asm volatile("sha256msg2 %xmm7, %xmm8"); + asm volatile("sha256msg2 %xmm15, %xmm8"); + asm volatile("sha256msg2 (%rax), %xmm0"); + asm volatile("sha256msg2 (%r8), %xmm0"); + asm volatile("sha256msg2 (0x12345678), %xmm0"); + asm volatile("sha256msg2 (%rax), %xmm3"); + asm volatile("sha256msg2 (%rcx,%rax,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(,%rax,1), %xmm0"); + asm volatile("sha256msg2 (%rax,%rcx,1), %xmm0"); + asm volatile("sha256msg2 (%rax,%rcx,8), %xmm0"); + asm volatile("sha256msg2 0x12(%rax), %xmm0"); + asm volatile("sha256msg2 0x12(%rbp), %xmm0"); + asm volatile("sha256msg2 0x12(%rcx,%rax,1), %xmm0"); + asm volatile("sha256msg2 0x12(%rbp,%rax,1), %xmm0"); + asm volatile("sha256msg2 0x12(%rax,%rcx,1), %xmm0"); + asm volatile("sha256msg2 0x12(%rax,%rcx,8), %xmm0"); + asm volatile("sha256msg2 0x12345678(%rax), %xmm0"); + asm volatile("sha256msg2 0x12345678(%rbp), %xmm0"); + asm volatile("sha256msg2 0x12345678(%rcx,%rax,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(%rbp,%rax,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(%rax,%rcx,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(%rax,%rcx,8), %xmm0"); + asm volatile("sha256msg2 0x12345678(%rax,%rcx,8), %xmm15"); + #else /* #ifdef __x86_64__ */ /* bndmk m32, bnd */ @@ -407,6 +611,175 @@ int main(void) asm volatile("bnd jmp *(%ecx)"); /* Expecting: jmp indirect 0 */ asm volatile("bnd jne label1"); /* Expecting: jcc conditional 0xfffffffc */ + /* sha1rnds4 imm8, xmm2/m128, xmm1 */ + + asm volatile("sha1rnds4 $0x0, %xmm1, %xmm0"); + asm volatile("sha1rnds4 $0x91, %xmm7, %xmm2"); + asm volatile("sha1rnds4 $0x91, (%eax), %xmm0"); + asm volatile("sha1rnds4 $0x91, (0x12345678), %xmm0"); + asm volatile("sha1rnds4 $0x91, (%eax), %xmm3"); + asm volatile("sha1rnds4 $0x91, (%ecx,%eax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(,%eax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, (%eax,%ecx,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, (%eax,%ecx,8), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%eax), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%ebp), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%ecx,%eax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%ebp,%eax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%eax,%ecx,1), %xmm0"); + asm 
volatile("sha1rnds4 $0x91, 0x12(%eax,%ecx,8), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%eax), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%ebp), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%ecx,%eax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%ebp,%eax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%eax,%ecx,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%eax,%ecx,8), %xmm0"); + + /* sha1nexte xmm2/m128, xmm1 */ + + asm volatile("sha1nexte %xmm1, %xmm0"); + asm volatile("sha1nexte %xmm7, %xmm2"); + asm volatile("sha1nexte (%eax), %xmm0"); + asm volatile("sha1nexte (0x12345678), %xmm0"); + asm volatile("sha1nexte (%eax), %xmm3"); + asm volatile("sha1nexte (%ecx,%eax,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(,%eax,1), %xmm0"); + asm volatile("sha1nexte (%eax,%ecx,1), %xmm0"); + asm volatile("sha1nexte (%eax,%ecx,8), %xmm0"); + asm volatile("sha1nexte 0x12(%eax), %xmm0"); + asm volatile("sha1nexte 0x12(%ebp), %xmm0"); + asm volatile("sha1nexte 0x12(%ecx,%eax,1), %xmm0"); + asm volatile("sha1nexte 0x12(%ebp,%eax,1), %xmm0"); + asm volatile("sha1nexte 0x12(%eax,%ecx,1), %xmm0"); + asm volatile("sha1nexte 0x12(%eax,%ecx,8), %xmm0"); + asm volatile("sha1nexte 0x12345678(%eax), %xmm0"); + asm volatile("sha1nexte 0x12345678(%ebp), %xmm0"); + asm volatile("sha1nexte 0x12345678(%ecx,%eax,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(%ebp,%eax,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(%eax,%ecx,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(%eax,%ecx,8), %xmm0"); + + /* sha1msg1 xmm2/m128, xmm1 */ + + asm volatile("sha1msg1 %xmm1, %xmm0"); + asm volatile("sha1msg1 %xmm7, %xmm2"); + asm volatile("sha1msg1 (%eax), %xmm0"); + asm volatile("sha1msg1 (0x12345678), %xmm0"); + asm volatile("sha1msg1 (%eax), %xmm3"); + asm volatile("sha1msg1 (%ecx,%eax,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(,%eax,1), %xmm0"); + asm volatile("sha1msg1 (%eax,%ecx,1), %xmm0"); + asm volatile("sha1msg1 (%eax,%ecx,8), %xmm0"); + asm volatile("sha1msg1 0x12(%eax), %xmm0"); + asm volatile("sha1msg1 0x12(%ebp), %xmm0"); + asm volatile("sha1msg1 0x12(%ecx,%eax,1), %xmm0"); + asm volatile("sha1msg1 0x12(%ebp,%eax,1), %xmm0"); + asm volatile("sha1msg1 0x12(%eax,%ecx,1), %xmm0"); + asm volatile("sha1msg1 0x12(%eax,%ecx,8), %xmm0"); + asm volatile("sha1msg1 0x12345678(%eax), %xmm0"); + asm volatile("sha1msg1 0x12345678(%ebp), %xmm0"); + asm volatile("sha1msg1 0x12345678(%ecx,%eax,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(%ebp,%eax,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(%eax,%ecx,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(%eax,%ecx,8), %xmm0"); + + /* sha1msg2 xmm2/m128, xmm1 */ + + asm volatile("sha1msg2 %xmm1, %xmm0"); + asm volatile("sha1msg2 %xmm7, %xmm2"); + asm volatile("sha1msg2 (%eax), %xmm0"); + asm volatile("sha1msg2 (0x12345678), %xmm0"); + asm volatile("sha1msg2 (%eax), %xmm3"); + asm volatile("sha1msg2 (%ecx,%eax,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(,%eax,1), %xmm0"); + asm volatile("sha1msg2 (%eax,%ecx,1), %xmm0"); + asm volatile("sha1msg2 (%eax,%ecx,8), %xmm0"); + asm volatile("sha1msg2 0x12(%eax), %xmm0"); + asm volatile("sha1msg2 0x12(%ebp), %xmm0"); + asm volatile("sha1msg2 0x12(%ecx,%eax,1), %xmm0"); + asm volatile("sha1msg2 0x12(%ebp,%eax,1), %xmm0"); + asm volatile("sha1msg2 0x12(%eax,%ecx,1), %xmm0"); + asm volatile("sha1msg2 0x12(%eax,%ecx,8), %xmm0"); + asm volatile("sha1msg2 0x12345678(%eax), %xmm0"); + asm volatile("sha1msg2 0x12345678(%ebp), %xmm0"); + asm volatile("sha1msg2 
0x12345678(%ecx,%eax,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(%ebp,%eax,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(%eax,%ecx,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(%eax,%ecx,8), %xmm0"); + + /* sha256rnds2 , xmm2/m128, xmm1 */ + /* Note sha256rnds2 has an implicit operand 'xmm0' */ + + asm volatile("sha256rnds2 %xmm4, %xmm1"); + asm volatile("sha256rnds2 %xmm7, %xmm2"); + asm volatile("sha256rnds2 (%eax), %xmm1"); + asm volatile("sha256rnds2 (0x12345678), %xmm1"); + asm volatile("sha256rnds2 (%eax), %xmm3"); + asm volatile("sha256rnds2 (%ecx,%eax,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(,%eax,1), %xmm1"); + asm volatile("sha256rnds2 (%eax,%ecx,1), %xmm1"); + asm volatile("sha256rnds2 (%eax,%ecx,8), %xmm1"); + asm volatile("sha256rnds2 0x12(%eax), %xmm1"); + asm volatile("sha256rnds2 0x12(%ebp), %xmm1"); + asm volatile("sha256rnds2 0x12(%ecx,%eax,1), %xmm1"); + asm volatile("sha256rnds2 0x12(%ebp,%eax,1), %xmm1"); + asm volatile("sha256rnds2 0x12(%eax,%ecx,1), %xmm1"); + asm volatile("sha256rnds2 0x12(%eax,%ecx,8), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%eax), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%ebp), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%ecx,%eax,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%ebp,%eax,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%eax,%ecx,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%eax,%ecx,8), %xmm1"); + + /* sha256msg1 xmm2/m128, xmm1 */ + + asm volatile("sha256msg1 %xmm1, %xmm0"); + asm volatile("sha256msg1 %xmm7, %xmm2"); + asm volatile("sha256msg1 (%eax), %xmm0"); + asm volatile("sha256msg1 (0x12345678), %xmm0"); + asm volatile("sha256msg1 (%eax), %xmm3"); + asm volatile("sha256msg1 (%ecx,%eax,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(,%eax,1), %xmm0"); + asm volatile("sha256msg1 (%eax,%ecx,1), %xmm0"); + asm volatile("sha256msg1 (%eax,%ecx,8), %xmm0"); + asm volatile("sha256msg1 0x12(%eax), %xmm0"); + asm volatile("sha256msg1 0x12(%ebp), %xmm0"); + asm volatile("sha256msg1 0x12(%ecx,%eax,1), %xmm0"); + asm volatile("sha256msg1 0x12(%ebp,%eax,1), %xmm0"); + asm volatile("sha256msg1 0x12(%eax,%ecx,1), %xmm0"); + asm volatile("sha256msg1 0x12(%eax,%ecx,8), %xmm0"); + asm volatile("sha256msg1 0x12345678(%eax), %xmm0"); + asm volatile("sha256msg1 0x12345678(%ebp), %xmm0"); + asm volatile("sha256msg1 0x12345678(%ecx,%eax,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(%ebp,%eax,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(%eax,%ecx,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(%eax,%ecx,8), %xmm0"); + + /* sha256msg2 xmm2/m128, xmm1 */ + + asm volatile("sha256msg2 %xmm1, %xmm0"); + asm volatile("sha256msg2 %xmm7, %xmm2"); + asm volatile("sha256msg2 (%eax), %xmm0"); + asm volatile("sha256msg2 (0x12345678), %xmm0"); + asm volatile("sha256msg2 (%eax), %xmm3"); + asm volatile("sha256msg2 (%ecx,%eax,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(,%eax,1), %xmm0"); + asm volatile("sha256msg2 (%eax,%ecx,1), %xmm0"); + asm volatile("sha256msg2 (%eax,%ecx,8), %xmm0"); + asm volatile("sha256msg2 0x12(%eax), %xmm0"); + asm volatile("sha256msg2 0x12(%ebp), %xmm0"); + asm volatile("sha256msg2 0x12(%ecx,%eax,1), %xmm0"); + asm volatile("sha256msg2 0x12(%ebp,%eax,1), %xmm0"); + asm volatile("sha256msg2 0x12(%eax,%ecx,1), %xmm0"); + asm volatile("sha256msg2 0x12(%eax,%ecx,8), %xmm0"); + asm volatile("sha256msg2 0x12345678(%eax), %xmm0"); + asm volatile("sha256msg2 0x12345678(%ebp), %xmm0"); + asm volatile("sha256msg2 0x12345678(%ecx,%eax,1), %xmm0"); + asm 
volatile("sha256msg2 0x12345678(%ebp,%eax,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(%eax,%ecx,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(%eax,%ecx,8), %xmm0"); + #endif /* #ifndef __x86_64__ */ /* Following line is a marker for the awk script - do not change */ diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt index a02a195d219c..25dad388b371 100644 --- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt @@ -736,6 +736,12 @@ bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1) be: vfnmsub231ps/d Vx,Hx,Wx (66),(v) bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1) # 0x0f 0x38 0xc0-0xff +c8: sha1nexte Vdq,Wdq +c9: sha1msg1 Vdq,Wdq +ca: sha1msg2 Vdq,Wdq +cb: sha256rnds2 Vdq,Wdq +cc: sha256msg1 Vdq,Wdq +cd: sha256msg2 Vdq,Wdq db: VAESIMC Vdq,Wdq (66),(v1) dc: VAESENC Vdq,Hdq,Wdq (66),(v1) dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) @@ -794,6 +800,7 @@ AVXcode: 3 61: vpcmpestri Vdq,Wdq,Ib (66),(v1) 62: vpcmpistrm Vdq,Wdq,Ib (66),(v1) 63: vpcmpistri Vdq,Wdq,Ib (66),(v1) +cc: sha1rnds4 Vdq,Wdq,Ib df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) f0: RORX Gy,Ey,Ib (F2),(v) EndTable -- cgit v1.2.3 From ac1c8859a81e2fc45db1dbff30bdc572005734ca Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 2 Sep 2015 15:15:29 +0300 Subject: x86/insn: perf tools: Add new memory instructions Intel Architecture Instruction Set Extensions Programing Reference (Oct 2014) describes 3 new memory instructions, namely clflushopt, clwb and pcommit. Add them to the op code map and the perf tools new instructions test. e.g. $ tools/perf/perf test "x86 ins" 39: Test x86 instruction decoder - new instructions : Ok Or to see the details: $ tools/perf/perf test -v "x86 ins" Signed-off-by: Adrian Hunter Acked-by: Masami Hiramatsu Cc: "H. 
Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Denys Vlasenko Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441196131-20632-6-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/lib/x86-opcode-map.txt | 4 +- tools/perf/tests/insn-x86-dat-32.c | 22 +++++++++++ tools/perf/tests/insn-x86-dat-64.c | 34 ++++++++++++++++ tools/perf/tests/insn-x86-dat-src.c | 46 ++++++++++++++++++++++ .../perf/util/intel-pt-decoder/x86-opcode-map.txt | 4 +- 5 files changed, 106 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 25dad388b371..f4f0451a301e 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -943,8 +943,8 @@ GrpTable: Grp15 3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) 4: XSAVE 5: XRSTOR | lfence (11B) -6: XSAVEOPT | mfence (11B) -7: clflush | sfence (11B) +6: XSAVEOPT | clwb (66) | mfence (11B) +7: clflush | clflushopt (66) | sfence (11B) | pcommit (66),(11B) EndTable GrpTable: Grp16 diff --git a/tools/perf/tests/insn-x86-dat-32.c b/tools/perf/tests/insn-x86-dat-32.c index 83f5078e74e1..4b09b7e130a0 100644 --- a/tools/perf/tests/insn-x86-dat-32.c +++ b/tools/perf/tests/insn-x86-dat-32.c @@ -616,3 +616,25 @@ "0f 38 cd 84 08 78 56 34 12 \tsha256msg2 0x12345678(%eax,%ecx,1),%xmm0",}, {{0x0f, 0x38, 0xcd, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", "0f 38 cd 84 c8 78 56 34 12 \tsha256msg2 0x12345678(%eax,%ecx,8),%xmm0",}, +{{0x66, 0x0f, 0xae, 0x38, }, 4, 0, "", "", +"66 0f ae 38 \tclflushopt (%eax)",}, +{{0x66, 0x0f, 0xae, 0x3d, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"66 0f ae 3d 78 56 34 12 \tclflushopt 0x12345678",}, +{{0x66, 0x0f, 0xae, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"66 0f ae bc c8 78 56 34 12 \tclflushopt 0x12345678(%eax,%ecx,8)",}, +{{0x0f, 0xae, 0x38, }, 3, 0, "", "", +"0f ae 38 \tclflush (%eax)",}, +{{0x0f, 0xae, 0xf8, }, 3, 0, "", "", +"0f ae f8 \tsfence ",}, +{{0x66, 0x0f, 0xae, 0x30, }, 4, 0, "", "", +"66 0f ae 30 \tclwb (%eax)",}, +{{0x66, 0x0f, 0xae, 0x35, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"66 0f ae 35 78 56 34 12 \tclwb 0x12345678",}, +{{0x66, 0x0f, 0xae, 0xb4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"66 0f ae b4 c8 78 56 34 12 \tclwb 0x12345678(%eax,%ecx,8)",}, +{{0x0f, 0xae, 0x30, }, 3, 0, "", "", +"0f ae 30 \txsaveopt (%eax)",}, +{{0x0f, 0xae, 0xf0, }, 3, 0, "", "", +"0f ae f0 \tmfence ",}, +{{0x66, 0x0f, 0xae, 0xf8, }, 4, 0, "", "", +"66 0f ae f8 \tpcommit ",}, diff --git a/tools/perf/tests/insn-x86-dat-64.c b/tools/perf/tests/insn-x86-dat-64.c index 13f008588590..5da235a4414f 100644 --- a/tools/perf/tests/insn-x86-dat-64.c +++ b/tools/perf/tests/insn-x86-dat-64.c @@ -702,3 +702,37 @@ "0f 38 cd 84 c8 78 56 34 12 \tsha256msg2 0x12345678(%rax,%rcx,8),%xmm0",}, {{0x44, 0x0f, 0x38, 0xcd, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", "44 0f 38 cd bc c8 78 56 34 12 \tsha256msg2 0x12345678(%rax,%rcx,8),%xmm15",}, +{{0x66, 0x0f, 0xae, 0x38, }, 4, 0, "", "", +"66 0f ae 38 \tclflushopt (%rax)",}, +{{0x66, 0x41, 0x0f, 0xae, 0x38, }, 5, 0, "", "", +"66 41 0f ae 38 \tclflushopt (%r8)",}, +{{0x66, 0x0f, 0xae, 0x3c, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"66 0f ae 3c 25 78 56 34 12 \tclflushopt 0x12345678",}, +{{0x66, 0x0f, 0xae, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"66 0f ae bc c8 78 56 34 12 \tclflushopt 0x12345678(%rax,%rcx,8)",}, +{{0x66, 0x41, 0x0f, 0xae, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 
0, "", "", +"66 41 0f ae bc c8 78 56 34 12 \tclflushopt 0x12345678(%r8,%rcx,8)",}, +{{0x0f, 0xae, 0x38, }, 3, 0, "", "", +"0f ae 38 \tclflush (%rax)",}, +{{0x41, 0x0f, 0xae, 0x38, }, 4, 0, "", "", +"41 0f ae 38 \tclflush (%r8)",}, +{{0x0f, 0xae, 0xf8, }, 3, 0, "", "", +"0f ae f8 \tsfence ",}, +{{0x66, 0x0f, 0xae, 0x30, }, 4, 0, "", "", +"66 0f ae 30 \tclwb (%rax)",}, +{{0x66, 0x41, 0x0f, 0xae, 0x30, }, 5, 0, "", "", +"66 41 0f ae 30 \tclwb (%r8)",}, +{{0x66, 0x0f, 0xae, 0x34, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"66 0f ae 34 25 78 56 34 12 \tclwb 0x12345678",}, +{{0x66, 0x0f, 0xae, 0xb4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"66 0f ae b4 c8 78 56 34 12 \tclwb 0x12345678(%rax,%rcx,8)",}, +{{0x66, 0x41, 0x0f, 0xae, 0xb4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"66 41 0f ae b4 c8 78 56 34 12 \tclwb 0x12345678(%r8,%rcx,8)",}, +{{0x0f, 0xae, 0x30, }, 3, 0, "", "", +"0f ae 30 \txsaveopt (%rax)",}, +{{0x41, 0x0f, 0xae, 0x30, }, 4, 0, "", "", +"41 0f ae 30 \txsaveopt (%r8)",}, +{{0x0f, 0xae, 0xf0, }, 3, 0, "", "", +"0f ae f0 \tmfence ",}, +{{0x66, 0x0f, 0xae, 0xf8, }, 4, 0, "", "", +"66 0f ae f8 \tpcommit ",}, diff --git a/tools/perf/tests/insn-x86-dat-src.c b/tools/perf/tests/insn-x86-dat-src.c index 7d06c9b22070..482637f44245 100644 --- a/tools/perf/tests/insn-x86-dat-src.c +++ b/tools/perf/tests/insn-x86-dat-src.c @@ -421,6 +421,30 @@ int main(void) asm volatile("sha256msg2 0x12345678(%rax,%rcx,8), %xmm0"); asm volatile("sha256msg2 0x12345678(%rax,%rcx,8), %xmm15"); + /* clflushopt m8 */ + + asm volatile("clflushopt (%rax)"); + asm volatile("clflushopt (%r8)"); + asm volatile("clflushopt (0x12345678)"); + asm volatile("clflushopt 0x12345678(%rax,%rcx,8)"); + asm volatile("clflushopt 0x12345678(%r8,%rcx,8)"); + /* Also check instructions in the same group encoding as clflushopt */ + asm volatile("clflush (%rax)"); + asm volatile("clflush (%r8)"); + asm volatile("sfence"); + + /* clwb m8 */ + + asm volatile("clwb (%rax)"); + asm volatile("clwb (%r8)"); + asm volatile("clwb (0x12345678)"); + asm volatile("clwb 0x12345678(%rax,%rcx,8)"); + asm volatile("clwb 0x12345678(%r8,%rcx,8)"); + /* Also check instructions in the same group encoding as clwb */ + asm volatile("xsaveopt (%rax)"); + asm volatile("xsaveopt (%r8)"); + asm volatile("mfence"); + #else /* #ifdef __x86_64__ */ /* bndmk m32, bnd */ @@ -780,8 +804,30 @@ int main(void) asm volatile("sha256msg2 0x12345678(%eax,%ecx,1), %xmm0"); asm volatile("sha256msg2 0x12345678(%eax,%ecx,8), %xmm0"); + /* clflushopt m8 */ + + asm volatile("clflushopt (%eax)"); + asm volatile("clflushopt (0x12345678)"); + asm volatile("clflushopt 0x12345678(%eax,%ecx,8)"); + /* Also check instructions in the same group encoding as clflushopt */ + asm volatile("clflush (%eax)"); + asm volatile("sfence"); + + /* clwb m8 */ + + asm volatile("clwb (%eax)"); + asm volatile("clwb (0x12345678)"); + asm volatile("clwb 0x12345678(%eax,%ecx,8)"); + /* Also check instructions in the same group encoding as clwb */ + asm volatile("xsaveopt (%eax)"); + asm volatile("mfence"); + #endif /* #ifndef __x86_64__ */ + /* pcommit */ + + asm volatile("pcommit"); + /* Following line is a marker for the awk script - do not change */ asm volatile("rdtsc"); /* Stop here */ diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt index 25dad388b371..f4f0451a301e 100644 --- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt @@ -943,8 +943,8 @@ 
GrpTable: Grp15 3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) 4: XSAVE 5: XRSTOR | lfence (11B) -6: XSAVEOPT | mfence (11B) -7: clflush | sfence (11B) +6: XSAVEOPT | clwb (66) | mfence (11B) +7: clflush | clflushopt (66) | sfence (11B) | pcommit (66),(11B) EndTable GrpTable: Grp16 -- cgit v1.2.3 From 978260cdbec3e34a3dfb2277ffc0aa1809457362 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 2 Sep 2015 15:15:30 +0300 Subject: x86/insn: perf tools: Add new memory protection keys instructions Add rdpkru and wrpkru to the op code map and the perf tools new instructions test. In the case of the test, only the bytes can be tested at the moment since binutils doesn't support the instructions yet. To run the test: $ tools/perf/perf test "x86 ins" 39: Test x86 instruction decoder - new instructions : Ok Or to see the details: $ tools/perf/perf test -v "x86 ins" 2>&1 | grep pkru For information about rdpkru and wrpkru, refer to the Intel SDM. Signed-off-by: Adrian Hunter Acked-by: Masami Hiramatsu Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Denys Vlasenko Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441196131-20632-7-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/lib/x86-opcode-map.txt | 2 +- tools/perf/tests/insn-x86.c | 4 ++++ tools/perf/util/intel-pt-decoder/x86-opcode-map.txt | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index f4f0451a301e..5a9705ed9139 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -885,7 +885,7 @@ GrpTable: Grp7 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) 3: LIDT Ms 4: SMSW Mw/Rv -5: +5: rdpkru (110),(11B) | wrpkru (111),(11B) 6: LMSW Ew 7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) EndTable diff --git a/tools/perf/tests/insn-x86.c b/tools/perf/tests/insn-x86.c index 0e126a099874..5c49eec81349 100644 --- a/tools/perf/tests/insn-x86.c +++ b/tools/perf/tests/insn-x86.c @@ -17,11 +17,15 @@ struct test_data { struct test_data test_data_32[] = { #include "insn-x86-dat-32.c" + {{0x0f, 0x01, 0xee}, 3, 0, NULL, NULL, "0f 01 ee \trdpkru"}, + {{0x0f, 0x01, 0xef}, 3, 0, NULL, NULL, "0f 01 ef \twrpkru"}, {{0}, 0, 0, NULL, NULL, NULL}, }; struct test_data test_data_64[] = { #include "insn-x86-dat-64.c" + {{0x0f, 0x01, 0xee}, 3, 0, NULL, NULL, "0f 01 ee \trdpkru"}, + {{0x0f, 0x01, 0xef}, 3, 0, NULL, NULL, "0f 01 ef \twrpkru"}, {{0}, 0, 0, NULL, NULL, NULL}, }; diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt index f4f0451a301e..5a9705ed9139 100644 --- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt @@ -885,7 +885,7 @@ GrpTable: Grp7 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) 3: LIDT Ms 4: SMSW Mw/Rv -5: +5: rdpkru (110),(11B) | wrpkru (111),(11B) 6: LMSW Ew 7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) EndTable -- cgit v1.2.3 From f83b6b64eba155cfb43ab8a5d9c422c3e7f603e6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 2 Sep 2015 15:15:31 +0300 Subject: x86/insn: perf tools: Add new xsave instructions Add xsavec, xsaves and xrstors to the op code map and the perf tools new instructions test. 
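As an aside for readers of the opcode map: entries in a GrpTable are selected by the ModRM.reg field, so the Grp9 additions below (3: xrstors, 4: xsavec, 5: xsaves) all share the 0f c7 opcode. A rough sketch of that lookup follows; the helper name and table are illustrative only, not the decoder's actual code, and the prefix/mod-dependent slots are elided.

        /* Illustrative sketch only: resolve a Grp9 (0x0f 0xc7) mnemonic from
         * the ModRM byte. Slot names follow the opcode map below; slots 6
         * and 7 also depend on prefixes and the mod bits, so they are elided. */
        static const char * const grp9_mnemonic[8] = {
                [1] = "cmpxchg8b/16b",
                [3] = "xrstors",
                [4] = "xsavec",
                [5] = "xsaves",
        };

        static const char *grp9_name(unsigned char modrm)
        {
                return grp9_mnemonic[(modrm >> 3) & 0x7];       /* bits 5:3 = reg */
        }

For example, "0f c7 20" has ModRM 0x20, i.e. reg = 4, hence xsavec (%eax).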
To run the test: $ tools/perf/perf test "x86 ins" 39: Test x86 instruction decoder - new instructions : Ok Or to see the details: $ tools/perf/perf test -v "x86 ins" 2>&1 | grep 'xsave\|xrst' For information about xsavec, xsaves and xrstors, refer to the Intel SDM. Signed-off-by: Adrian Hunter Acked-by: Masami Hiramatsu Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Denys Vlasenko Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441196131-20632-8-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/lib/x86-opcode-map.txt | 3 ++ tools/perf/tests/insn-x86-dat-32.c | 18 ++++++++++ tools/perf/tests/insn-x86-dat-64.c | 30 ++++++++++++++++ tools/perf/tests/insn-x86-dat-src.c | 42 ++++++++++++++++++++++ .../perf/util/intel-pt-decoder/x86-opcode-map.txt | 3 ++ 5 files changed, 96 insertions(+) (limited to 'arch') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 5a9705ed9139..d388de72eaca 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -899,6 +899,9 @@ EndTable GrpTable: Grp9 1: CMPXCHG8B/16B Mq/Mdq +3: xrstors +4: xsavec +5: xsaves 6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) 7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B) EndTable diff --git a/tools/perf/tests/insn-x86-dat-32.c b/tools/perf/tests/insn-x86-dat-32.c index 4b09b7e130a0..3b491cfe204e 100644 --- a/tools/perf/tests/insn-x86-dat-32.c +++ b/tools/perf/tests/insn-x86-dat-32.c @@ -636,5 +636,23 @@ "0f ae 30 \txsaveopt (%eax)",}, {{0x0f, 0xae, 0xf0, }, 3, 0, "", "", "0f ae f0 \tmfence ",}, +{{0x0f, 0xc7, 0x20, }, 3, 0, "", "", +"0f c7 20 \txsavec (%eax)",}, +{{0x0f, 0xc7, 0x25, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "", "", +"0f c7 25 78 56 34 12 \txsavec 0x12345678",}, +{{0x0f, 0xc7, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 a4 c8 78 56 34 12 \txsavec 0x12345678(%eax,%ecx,8)",}, +{{0x0f, 0xc7, 0x28, }, 3, 0, "", "", +"0f c7 28 \txsaves (%eax)",}, +{{0x0f, 0xc7, 0x2d, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "", "", +"0f c7 2d 78 56 34 12 \txsaves 0x12345678",}, +{{0x0f, 0xc7, 0xac, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 ac c8 78 56 34 12 \txsaves 0x12345678(%eax,%ecx,8)",}, +{{0x0f, 0xc7, 0x18, }, 3, 0, "", "", +"0f c7 18 \txrstors (%eax)",}, +{{0x0f, 0xc7, 0x1d, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "", "", +"0f c7 1d 78 56 34 12 \txrstors 0x12345678",}, +{{0x0f, 0xc7, 0x9c, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 9c c8 78 56 34 12 \txrstors 0x12345678(%eax,%ecx,8)",}, {{0x66, 0x0f, 0xae, 0xf8, }, 4, 0, "", "", "66 0f ae f8 \tpcommit ",}, diff --git a/tools/perf/tests/insn-x86-dat-64.c b/tools/perf/tests/insn-x86-dat-64.c index 5da235a4414f..4fe7cce179c4 100644 --- a/tools/perf/tests/insn-x86-dat-64.c +++ b/tools/perf/tests/insn-x86-dat-64.c @@ -734,5 +734,35 @@ "41 0f ae 30 \txsaveopt (%r8)",}, {{0x0f, 0xae, 0xf0, }, 3, 0, "", "", "0f ae f0 \tmfence ",}, +{{0x0f, 0xc7, 0x20, }, 3, 0, "", "", +"0f c7 20 \txsavec (%rax)",}, +{{0x41, 0x0f, 0xc7, 0x20, }, 4, 0, "", "", +"41 0f c7 20 \txsavec (%r8)",}, +{{0x0f, 0xc7, 0x24, 0x25, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 24 25 78 56 34 12 \txsavec 0x12345678",}, +{{0x0f, 0xc7, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 a4 c8 78 56 34 12 \txsavec 0x12345678(%rax,%rcx,8)",}, +{{0x41, 0x0f, 0xc7, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"41 0f c7 a4 c8 78 56 34 12 \txsavec 0x12345678(%r8,%rcx,8)",}, +{{0x0f, 0xc7, 0x28, }, 3, 0, "", 
"", +"0f c7 28 \txsaves (%rax)",}, +{{0x41, 0x0f, 0xc7, 0x28, }, 4, 0, "", "", +"41 0f c7 28 \txsaves (%r8)",}, +{{0x0f, 0xc7, 0x2c, 0x25, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 2c 25 78 56 34 12 \txsaves 0x12345678",}, +{{0x0f, 0xc7, 0xac, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 ac c8 78 56 34 12 \txsaves 0x12345678(%rax,%rcx,8)",}, +{{0x41, 0x0f, 0xc7, 0xac, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"41 0f c7 ac c8 78 56 34 12 \txsaves 0x12345678(%r8,%rcx,8)",}, +{{0x0f, 0xc7, 0x18, }, 3, 0, "", "", +"0f c7 18 \txrstors (%rax)",}, +{{0x41, 0x0f, 0xc7, 0x18, }, 4, 0, "", "", +"41 0f c7 18 \txrstors (%r8)",}, +{{0x0f, 0xc7, 0x1c, 0x25, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 1c 25 78 56 34 12 \txrstors 0x12345678",}, +{{0x0f, 0xc7, 0x9c, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 9c c8 78 56 34 12 \txrstors 0x12345678(%rax,%rcx,8)",}, +{{0x41, 0x0f, 0xc7, 0x9c, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"41 0f c7 9c c8 78 56 34 12 \txrstors 0x12345678(%r8,%rcx,8)",}, {{0x66, 0x0f, 0xae, 0xf8, }, 4, 0, "", "", "66 0f ae f8 \tpcommit ",}, diff --git a/tools/perf/tests/insn-x86-dat-src.c b/tools/perf/tests/insn-x86-dat-src.c index 482637f44245..41b1b1c62660 100644 --- a/tools/perf/tests/insn-x86-dat-src.c +++ b/tools/perf/tests/insn-x86-dat-src.c @@ -445,6 +445,30 @@ int main(void) asm volatile("xsaveopt (%r8)"); asm volatile("mfence"); + /* xsavec mem */ + + asm volatile("xsavec (%rax)"); + asm volatile("xsavec (%r8)"); + asm volatile("xsavec (0x12345678)"); + asm volatile("xsavec 0x12345678(%rax,%rcx,8)"); + asm volatile("xsavec 0x12345678(%r8,%rcx,8)"); + + /* xsaves mem */ + + asm volatile("xsaves (%rax)"); + asm volatile("xsaves (%r8)"); + asm volatile("xsaves (0x12345678)"); + asm volatile("xsaves 0x12345678(%rax,%rcx,8)"); + asm volatile("xsaves 0x12345678(%r8,%rcx,8)"); + + /* xrstors mem */ + + asm volatile("xrstors (%rax)"); + asm volatile("xrstors (%r8)"); + asm volatile("xrstors (0x12345678)"); + asm volatile("xrstors 0x12345678(%rax,%rcx,8)"); + asm volatile("xrstors 0x12345678(%r8,%rcx,8)"); + #else /* #ifdef __x86_64__ */ /* bndmk m32, bnd */ @@ -822,6 +846,24 @@ int main(void) asm volatile("xsaveopt (%eax)"); asm volatile("mfence"); + /* xsavec mem */ + + asm volatile("xsavec (%eax)"); + asm volatile("xsavec (0x12345678)"); + asm volatile("xsavec 0x12345678(%eax,%ecx,8)"); + + /* xsaves mem */ + + asm volatile("xsaves (%eax)"); + asm volatile("xsaves (0x12345678)"); + asm volatile("xsaves 0x12345678(%eax,%ecx,8)"); + + /* xrstors mem */ + + asm volatile("xrstors (%eax)"); + asm volatile("xrstors (0x12345678)"); + asm volatile("xrstors 0x12345678(%eax,%ecx,8)"); + #endif /* #ifndef __x86_64__ */ /* pcommit */ diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt index 5a9705ed9139..d388de72eaca 100644 --- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt @@ -899,6 +899,9 @@ EndTable GrpTable: Grp9 1: CMPXCHG8B/16B Mq/Mdq +3: xrstors +4: xsavec +5: xsaves 6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) 7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B) EndTable -- cgit v1.2.3 From b20112edeadf0b8a1416de061caa4beb11539902 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 21 Aug 2015 12:05:18 +0300 Subject: perf/x86: Improve accuracy of perf/sched clock When TSC is stable perf/sched clock is based on it. 
However the conversion from cycles to nanoseconds is not as accurate as it could be. Because CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048 The change is to calculate the maximum shift that results in a multiplier that is still a 32-bit number. For example all frequencies over 1 GHz will have a shift of 32, making the accuracy of the conversion +/- 1/(2^33). That is achieved by using the 'clocks_calc_mult_shift()' function. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1440147918-22250-1-git-send-email-adrian.hunter@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c8d52cb4cb6e..39ec3d07affd 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -167,21 +167,20 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) * ns = cycles * cyc2ns_scale / SC * * And since SC is a constant power of two, we can convert the div - * into a shift. + * into a shift. The larger SC is, the more accurate the conversion, but + * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication + * (64-bit result) can be used. * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * We can use khz divisor instead of mhz to keep a better precision. * (mathieu.desnoyers@polymtl.ca) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - static void cyc2ns_data_init(struct cyc2ns_data *data) { data->cyc2ns_mul = 0; - data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_shift = 0; data->cyc2ns_offset = 0; data->__count = 0; } @@ -215,14 +214,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) if (likely(data == tail)) { ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); } else { data->__count++; barrier(); ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); barrier(); @@ -256,12 +255,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. */ - data->cyc2ns_mul = - DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, - cpu_khz); - data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz, + NSEC_PER_MSEC, 0); + data->cyc2ns_offset = ns_now - - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift); cyc2ns_write_end(cpu, data); -- cgit v1.2.3 From a09d31f45224c219a4d5c728fa40150dc9d3e3e5 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Mon, 31 Aug 2015 17:09:27 +0300 Subject: perf/x86/intel/ds: Work around BTS leaking kernel addresses BTS leaks kernel addresses even in userspace-only mode due to imprecise IP sampling, so sometimes syscall entry points or page fault handler addresses end up in a userspace trace. 
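For context, whether a record "contains a kernel address" is decided by the driver's kernel_ip() helper, which is roughly the following (a simplified sketch, not the verbatim kernel source):

        /* Sketch: kernel text lives in the top half of the address space. */
        static inline bool kernel_ip(unsigned long ip)
        {
        #ifdef CONFIG_X86_32
                return ip > PAGE_OFFSET;        /* 32-bit: above the user/kernel split */
        #else
                return (long)ip < 0;            /* 64-bit: sign bit set => kernel */
        #endif
        }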
Since this driver uses a relatively small buffer for BTS records and it has to iterate through them anyway, it can also take on the additional job of filtering out the records that contain kernel addresses when kernel space tracing is not enabled. This patch changes the bts code to skip the offending records from perf output. In order to request the exact amount of space on the ring buffer, we need to do an extra pass through the records to know how many there are of the valid ones, but considering the small size of the buffer, this extra pass adds very little overhead to the nmi handler. This way we won't end up with awkward IP samples with zero IPs in the perf stream. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1441030168-6853-2-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 40 ++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 84f236ab96b0..5db1c7755548 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -510,10 +510,11 @@ int intel_pmu_drain_bts_buffer(void) u64 flags; }; struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *top; + struct bts_record *at, *base, *top; struct perf_output_handle handle; struct perf_event_header header; struct perf_sample_data data; + unsigned long skip = 0; struct pt_regs regs; if (!event) @@ -522,10 +523,10 @@ int intel_pmu_drain_bts_buffer(void) if (!x86_pmu.bts_active) return 0; - at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; + base = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; - if (top <= at) + if (top <= base) return 0; memset(®s, 0, sizeof(regs)); @@ -534,6 +535,27 @@ int intel_pmu_drain_bts_buffer(void) perf_sample_data_init(&data, 0, event->hw.last_period); + /* + * BTS leaks kernel addresses in branches across the cpl boundary, + * such as traps or system calls, so unless the user is asking for + * kernel tracing (and right now it's not possible), we'd need to + * filter them out. But first we need to count how many of those we + * have in the current batch. This is an extra O(n) pass, however, + * it's much faster than the other one especially considering that + * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the + * alloc_bts_buffer()). + */ + for (at = base; at < top; at++) { + /* + * Note that right now *this* BTS code only works if + * attr::exclude_kernel is set, but let's keep this extra + * check here in case that changes. + */ + if (event->attr.exclude_kernel && + (kernel_ip(at->from) || kernel_ip(at->to))) + skip++; + } + /* * Prepare a generic sample, i.e. fill in the invariant fields. 
* We will overwrite the from and to address before we output @@ -541,10 +563,16 @@ */ perf_prepare_sample(&header, &data, event, &regs); - if (perf_output_begin(&handle, event, header.size * (top - at))) + if (perf_output_begin(&handle, event, header.size * + (top - base - skip))) return 1; - for (; at < top; at++) { + for (at = base; at < top; at++) { + /* Filter out any records that contain kernel addresses. */ + if (event->attr.exclude_kernel && + (kernel_ip(at->from) || kernel_ip(at->to))) + continue; + data.ip = at->from; data.addr = at->to; -- cgit v1.2.3 From d2878d642a4edd1d57c691dc3e4d7847cbf9d442 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Mon, 31 Aug 2015 17:09:28 +0300 Subject: perf/x86/intel/bts: Disallow use by unprivileged users on paranoid systems BTS leaks kernel addresses even in userspace-only mode due to imprecise IP sampling, so sometimes syscall entry points or page fault handler addresses end up in a userspace trace. Now, the intel_bts driver exports trace data zero-copy; it does not scan through it to filter out the kernel addresses, and doing so would be an O(n) job. To work around this situation, this patch forbids the use of the intel_bts driver by unprivileged users on systems with the paranoid setting above the (kernel's) default "1", which still allows kernel profiling. In other words, using the intel_bts driver implies kernel tracing, regardless of the "exclude_kernel" attribute setting. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1441030168-6853-3-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_bts.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c index d1c0f254afbe..2cad71d1b14c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -495,6 +495,19 @@ static int bts_event_init(struct perf_event *event) if (x86_add_exclusive(x86_lbr_exclusive_bts)) return -EBUSY; + /* + * BTS leaks kernel addresses even when CPL0 tracing is + * disabled, so disallow intel_bts driver for unprivileged + * users on paranoid systems since it provides trace data + * to the user in a zero-copy fashion. + * + * Note that the default paranoia setting permits unprivileged + * users to profile the kernel. + */ + if (event->attr.exclude_kernel && perf_paranoid_kernel() && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + ret = x86_reserve_hardware(); if (ret) { x86_del_exclusive(x86_lbr_exclusive_bts); -- cgit v1.2.3 From deb27519bf1f4b21a761c0675dbdf1196df7d72a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 17 Aug 2015 08:37:31 -0400 Subject: perf/x86/intel: Fix LBR callstack issue caused by FREEZE_LBRS_ON_PMI This patch fixes an issue which was introduced by commit 1a78d93750bb5f61abdc59a91fc3bd06a214542a ("perf/x86/intel: Streamline LBR MSR handling in PMI"). The old patch not only avoids writing the LBR_SELECT MSR in a PMI, but also avoids updating the lbr_select variable. So in a PMI, the FREEZE_LBRS_ON_PMI bit is always mistakenly set for the IA32_DEBUGCTLMSR MSR, which causes a superfluous increase/decrease of LBR_TOS when collecting the LBR callstack. 
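Condensed, the pre-fix logic looked roughly like this (a simplified sketch of __intel_pmu_lbr_enable(), not the complete function):

        static void __intel_pmu_lbr_enable(bool pmi)
        {
                struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
                u64 debugctl, lbr_select = 0;

                if (cpuc->lbr_sel && !pmi) {
                        lbr_select = cpuc->lbr_sel->config;     /* never runs in a PMI */
                        wrmsrl(MSR_LBR_SELECT, lbr_select);
                }

                rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
                /* In a PMI, lbr_select is still 0 here, so LBR_CALL_STACK looks
                 * unset and FREEZE_LBRS_ON_PMI gets set even in call-stack mode. */
                if (!(lbr_select & LBR_CALL_STACK))
                        debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
                /* ... */
        }

The fix below hoists the lbr_select assignment out of the conditional so that this decision always sees the real configuration.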
Reported-by: Milian Wolff Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1439815051-8616-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index b2c9475b7ff2..a1d07c71d3b6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -151,10 +151,9 @@ static void __intel_pmu_lbr_enable(bool pmi) * No need to reprogram LBR_SELECT in a PMI, as it * did not change. */ - if (cpuc->lbr_sel && !pmi) { - lbr_select = cpuc->lbr_sel->config; + lbr_select = cpuc->lbr_sel->config; + if (cpuc->lbr_sel && !pmi) wrmsrl(MSR_LBR_SELECT, lbr_select); - } rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); orig_debugctl = debugctl; -- cgit v1.2.3 From 73fdeb66592ee80dffb16fb8a9b7378a00c1a826 Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Mon, 31 Aug 2015 16:21:02 +0800 Subject: perf/x86/intel/pt: Fix KVM warning due to doing rdmsr() before the CPUID test If KVM does not support INTEL_PT, guest MSR_IA32_RTIT_CTL reading will produce host warning like "kvm [2469]: vcpu0 unhandled rdmsr: 0x570". Guest can determine whether the CPU supports Intel_PT according to CPUID, so test_cpu_cap function is added before rdmsr,and it is more in line with the code style. Signed-off-by: Huaitong Han Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Link: http://lkml.kernel.org/r/1441009262-9792-1-git-send-email-huaitong.han@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 42169283448b..868e1194337f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -139,9 +139,6 @@ static int __init pt_pmu_hw_init(void) long i; attrs = NULL; - ret = -ENODEV; - if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) - goto fail; for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, @@ -1130,6 +1127,10 @@ static __init int pt_init(void) int ret, cpu, prior_warn = 0; BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); + + if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) + return -ENODEV; + get_online_cpus(); for_each_online_cpu(cpu) { u64 ctl; -- cgit v1.2.3 From 845583767c306dac0290aab908c18b01772ea4b4 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 3 Sep 2015 20:07:44 -0700 Subject: sparc, perf/sparc: Remove unnecessary assignment In ->commit_txn() 'cpuc' is already initialized when it is declared, so we can remove the duplicate assignment. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David S. 
Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/1441336073-22750-2-git-send-email-sukadev@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/sparc/kernel/perf_event.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 689db65f8529..e3f89c7e5062 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1528,7 +1528,6 @@ static int sparc_pmu_commit_txn(struct pmu *pmu) if (!sparc_pmu) return -EINVAL; - cpuc = this_cpu_ptr(&cpu_hw_events); n = cpuc->n_events; if (check_excludes(cpuc->event, 0, n)) return -EINVAL; -- cgit v1.2.3 From fbbe07011581990ef74dfac06dc8511b1a14badb Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 3 Sep 2015 20:07:45 -0700 Subject: perf/core: Add a 'flags' parameter to the PMU transactional interfaces Currently, the PMU interface allows reading only one counter at a time. But some PMUs, like the 24x7 counters in Power, support reading several counters at once. To leverage this functionality, extend the transaction interface to support a "transaction type". The first type, PERF_PMU_TXN_ADD, refers to the existing transactions, i.e. used to _schedule_ all the events on the PMU as a group. A second transaction type, PERF_PMU_TXN_READ, will be used in a follow-on patch, by the 24x7 counters to read several counters at once. Extend the transaction interfaces to the PMU to accept a 'txn_flags' parameter and use this parameter to ignore any transactions that are not of type PERF_PMU_TXN_ADD. Thanks to Peter Zijlstra for his input. Signed-off-by: Sukadev Bhattiprolu [peterz: s390 compile fix] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Michael Ellerman Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1441336073-22750-3-git-send-email-sukadev@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/powerpc/perf/core-book3s.c | 30 +++++++++++++++++++++++++++++- arch/s390/kernel/perf_cpum_cf.c | 30 +++++++++++++++++++++++++++++- arch/sparc/kernel/perf_event.c | 25 ++++++++++++++++++++++++- arch/x86/kernel/cpu/perf_event.c | 32 +++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/perf_event.h | 1 + include/linux/perf_event.h | 14 +++++++++++--- kernel/events/core.c | 31 ++++++++++++++++++++++++++++--- 7 files changed, 153 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index b0382f3f1095..c84074185c80 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -49,6 +49,7 @@ struct cpu_hw_events { unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; unsigned int group_flag; + unsigned int txn_flags; int n_txn_start; /* BHRB bits */ @@ -1586,11 +1587,21 @@ static void power_pmu_stop(struct perf_event *event, int ef_flags) * Start group events scheduling transaction * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time + * + * We only support PERF_PMU_TXN_ADD transactions. Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. 
*/ -static void power_pmu_start_txn(struct pmu *pmu) +static void power_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + WARN_ON_ONCE(cpuhw->txn_flags); /* txn already in flight */ + + cpuhw->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); cpuhw->group_flag |= PERF_EVENT_TXN; cpuhw->n_txn_start = cpuhw->n_events; @@ -1604,6 +1615,14 @@ static void power_pmu_start_txn(struct pmu *pmu) static void power_pmu_cancel_txn(struct pmu *pmu) { struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + unsigned int txn_flags; + + WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ + + txn_flags = cpuhw->txn_flags; + cpuhw->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; cpuhw->group_flag &= ~PERF_EVENT_TXN; perf_pmu_enable(pmu); @@ -1621,7 +1640,15 @@ static int power_pmu_commit_txn(struct pmu *pmu) if (!ppmu) return -EAGAIN; + cpuhw = this_cpu_ptr(&cpu_hw_events); + WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ + + if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) { + cpuhw->txn_flags = 0; + return 0; + } + n = cpuhw->n_events; if (check_excludes(cpuhw->event, cpuhw->flags, 0, n)) return -EAGAIN; @@ -1633,6 +1660,7 @@ static int power_pmu_commit_txn(struct pmu *pmu) cpuhw->event[i]->hw.config = cpuhw->events[i]; cpuhw->group_flag &= ~PERF_EVENT_TXN; + cpuhw->txn_flags = 0; perf_pmu_enable(pmu); return 0; } diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 56fdad479115..19138be412d6 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -72,6 +72,7 @@ struct cpu_hw_events { atomic_t ctr_set[CPUMF_CTR_SET_MAX]; u64 state, tx_state; unsigned int flags; + unsigned int txn_flags; }; static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .ctr_set = { @@ -82,6 +83,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { }, .state = 0, .flags = 0, + .txn_flags = 0, }; static int get_counter_set(u64 event) @@ -572,11 +574,21 @@ static void cpumf_pmu_del(struct perf_event *event, int flags) /* * Start group events scheduling transaction. * Set flags to perform a single test at commit time. + * + * We only support PERF_PMU_TXN_ADD transactions. Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. 
*/ -static void cpumf_pmu_start_txn(struct pmu *pmu) +static void cpumf_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + WARN_ON_ONCE(cpuhw->txn_flags); /* txn already in flight */ + + cpuhw->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); cpuhw->flags |= PERF_EVENT_TXN; cpuhw->tx_state = cpuhw->state; @@ -589,8 +601,16 @@ static void cpumf_pmu_start_txn(struct pmu *pmu) */ static void cpumf_pmu_cancel_txn(struct pmu *pmu) { + unsigned int txn_flags; struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ + + txn_flags = cpuhw->txn_flags; + cpuhw->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + WARN_ON(cpuhw->tx_state != cpuhw->state); cpuhw->flags &= ~PERF_EVENT_TXN; @@ -607,6 +627,13 @@ static int cpumf_pmu_commit_txn(struct pmu *pmu) struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); u64 state; + WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ + + if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) { + cpuhw->txn_flags = 0; + return 0; + } + /* check if the updated state can be scheduled */ state = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); state >>= CPUMF_LCCTL_ENABLE_SHIFT; @@ -614,6 +641,7 @@ static int cpumf_pmu_commit_txn(struct pmu *pmu) return -EPERM; cpuhw->flags &= ~PERF_EVENT_TXN; + cpuhw->txn_flags = 0; perf_pmu_enable(pmu); return 0; } diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index e3f89c7e5062..2c0984d146ec 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -109,6 +109,7 @@ struct cpu_hw_events { int enabled; unsigned int group_flag; + unsigned int txn_flags; }; static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; @@ -1494,10 +1495,16 @@ static int sparc_pmu_event_init(struct perf_event *event) * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time */ -static void sparc_pmu_start_txn(struct pmu *pmu) +static void sparc_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + WARN_ON_ONCE(cpuhw->txn_flags); /* txn already in flight */ + + cpuhw->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); cpuhw->group_flag |= PERF_EVENT_TXN; } @@ -1510,6 +1517,14 @@ static void sparc_pmu_start_txn(struct pmu *pmu) static void sparc_pmu_cancel_txn(struct pmu *pmu) { struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + unsigned int txn_flags; + + WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ + + txn_flags = cpuhw->txn_flags; + cpuhw->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; cpuhw->group_flag &= ~PERF_EVENT_TXN; perf_pmu_enable(pmu); @@ -1528,6 +1543,13 @@ static int sparc_pmu_commit_txn(struct pmu *pmu) if (!sparc_pmu) return -EINVAL; + WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ + + if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { + cpuc->txn_flags = 0; + return 0; + } + n = cpuc->n_events; if (check_excludes(cpuc->event, 0, n)) return -EINVAL; @@ -1535,6 +1557,7 @@ static int sparc_pmu_commit_txn(struct pmu *pmu) return -EAGAIN; cpuc->group_flag &= ~PERF_EVENT_TXN; + cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 66dd3fe99b82..dc77ef470e0e 100644 --- 
a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1748,9 +1748,21 @@ static inline void x86_pmu_read(struct perf_event *event) * Start group events scheduling transaction * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time + * + * We only support PERF_PMU_TXN_ADD transactions. Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. */ -static void x86_pmu_start_txn(struct pmu *pmu) +static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ + + cpuc->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN); __this_cpu_write(cpu_hw_events.n_txn, 0); @@ -1763,6 +1775,16 @@ static void x86_pmu_start_txn(struct pmu *pmu) */ static void x86_pmu_cancel_txn(struct pmu *pmu) { + unsigned int txn_flags; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ + + txn_flags = cpuc->txn_flags; + cpuc->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); /* * Truncate collected array by the number of events added in this @@ -1786,6 +1808,13 @@ static int x86_pmu_commit_txn(struct pmu *pmu) int assign[X86_PMC_IDX_MAX]; int n, ret; + WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ + + if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { + cpuc->txn_flags = 0; + return 0; + } + n = cpuc->n_events; if (!x86_pmu_initialized()) @@ -1802,6 +1831,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu) memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->group_flag &= ~PERF_EVENT_TXN; + cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 5edf6d868fc1..c99c93b9e348 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -196,6 +196,7 @@ struct cpu_hw_events { int n_excl; /* the number of exclusive events */ unsigned int group_flag; + unsigned int txn_flags; int is_fake; /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d61ee4459eee..ea3b5dd21a4c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -201,6 +201,8 @@ struct perf_event; */ #define PERF_EVENT_TXN 0x1 +#define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ + /** * pmu::capabilities flags */ @@ -331,20 +333,26 @@ struct pmu { * * Start the transaction, after this ->add() doesn't need to * do schedulability tests. + * + * Optional. */ - void (*start_txn) (struct pmu *pmu); /* optional */ + void (*start_txn) (struct pmu *pmu, unsigned int txn_flags); /* * If ->start_txn() disabled the ->add() schedulability test * then ->commit_txn() is required to perform one. On success * the transaction is closed. On error the transaction is kept * open until ->cancel_txn() is called. + * + * Optional. */ - int (*commit_txn) (struct pmu *pmu); /* optional */ + int (*commit_txn) (struct pmu *pmu); /* * Will cancel the transaction, assumes ->del() is called * for each successful ->add() during the transaction. + * + * Optional. 
*/ - void (*cancel_txn) (struct pmu *pmu); /* * Will return the value for perf_event_mmap_page::index for this event, diff --git a/kernel/events/core.c b/kernel/events/core.c index 76e64be9bfb5..c80cee82959f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1914,7 +1914,7 @@ group_sched_in(struct perf_event *group_event, if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - pmu->start_txn(pmu); + pmu->start_txn(pmu, PERF_PMU_TXN_ADD); if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); @@ -7267,24 +7267,49 @@ static void perf_pmu_nop_void(struct pmu *pmu) { } +static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) +{ +} + static int perf_pmu_nop_int(struct pmu *pmu) { return 0; } -static void perf_pmu_start_txn(struct pmu *pmu) +DEFINE_PER_CPU(unsigned int, nop_txn_flags); + +static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) { + __this_cpu_write(nop_txn_flags, flags); + + if (flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); } static int perf_pmu_commit_txn(struct pmu *pmu) { + unsigned int flags = __this_cpu_read(nop_txn_flags); + + __this_cpu_write(nop_txn_flags, 0); + + if (flags & ~PERF_PMU_TXN_ADD) + return 0; + perf_pmu_enable(pmu); return 0; } static void perf_pmu_cancel_txn(struct pmu *pmu) { + unsigned int flags = __this_cpu_read(nop_txn_flags); + + __this_cpu_write(nop_txn_flags, 0); + + if (flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_enable(pmu); } @@ -7523,7 +7548,7 @@ got_cpu_context: pmu->commit_txn = perf_pmu_commit_txn; pmu->cancel_txn = perf_pmu_cancel_txn; } else { - pmu->start_txn = perf_pmu_nop_void; + pmu->start_txn = perf_pmu_nop_txn; pmu->commit_txn = perf_pmu_nop_int; pmu->cancel_txn = perf_pmu_nop_void; } -- cgit v1.2.3 From 88a486132dffeb010d659e00361c4d114de09972 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 3 Sep 2015 20:07:52 -0700 Subject: powerpc, perf/powerpc/hv-24x7: Use PMU_TXN_READ interface The 24x7 counters in Powerpc allow monitoring a large number of counters simultaneously. They also allow reading several counters in a single HCALL so we can get a more consistent snapshot of the system. Use the PMU's transaction interface to monitor and read several event counters at once. The idea is that users can group several 24x7 events into a single group of events. We use the following logic to submit the group of events to the PMU and read the values:

	pmu->start_txn()		// Initialize before first event

	for each event in group
		pmu->read(event);	// Queue each event to be read

	pmu->commit_txn()		// Read/update all queued counters

The ->commit_txn() also updates the event counts in the respective perf_event objects. The perf subsystem can then directly get the event counts from the perf_event and can avoid submitting a new ->read() request to the PMU. Thanks to input from Peter Zijlstra. 
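Schematically, the caller's side of a READ transaction would look like this ('leader' and 'sub' are placeholder variables; this is a sketch of the calling convention, not code added by this patch):

        /* Sketch: read a whole event group with a single transaction. */
        pmu->start_txn(pmu, PERF_PMU_TXN_READ);

        pmu->read(leader);                      /* ->read() only queues the counter */
        list_for_each_entry(sub, &leader->sibling_list, group_entry)
                pmu->read(sub);

        ret = pmu->commit_txn(pmu);             /* one HCALL reads and updates them all */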
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Michael Ellerman Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1441336073-22750-10-git-send-email-sukadev@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/powerpc/perf/hv-24x7.c | 166 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 164 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 527c8b98e97e..9f9dfda9ed2c 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -142,6 +142,15 @@ static struct attribute_group event_long_desc_group = { static struct kmem_cache *hv_page_cache; +DEFINE_PER_CPU(int, hv_24x7_txn_flags); +DEFINE_PER_CPU(int, hv_24x7_txn_err); + +struct hv_24x7_hw { + struct perf_event *events[255]; +}; + +DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw); + /* * request_buffer and result_buffer are not required to be 4k aligned, * but are not allowed to cross any 4k boundary. Aligning them to 4k is @@ -1231,9 +1240,48 @@ static void update_event_count(struct perf_event *event, u64 now) static void h_24x7_event_read(struct perf_event *event) { u64 now; + struct hv_24x7_request_buffer *request_buffer; + struct hv_24x7_hw *h24x7hw; + int txn_flags; + + txn_flags = __this_cpu_read(hv_24x7_txn_flags); + + /* + * If in a READ transaction, add this counter to the list of + * counters to read during the next HCALL (i.e commit_txn()). + * If not in a READ transaction, go ahead and make the HCALL + * to read this counter by itself. + */ + + if (txn_flags & PERF_PMU_TXN_READ) { + int i; + int ret; - now = h_24x7_get_value(event); - update_event_count(event, now); + if (__this_cpu_read(hv_24x7_txn_err)) + return; + + request_buffer = (void *)get_cpu_var(hv_24x7_reqb); + + ret = add_event_to_24x7_request(event, request_buffer); + if (ret) { + __this_cpu_write(hv_24x7_txn_err, ret); + } else { + /* + * Assoicate the event with the HCALL request index, + * so ->commit_txn() can quickly find/update count. + */ + i = request_buffer->num_requests - 1; + + h24x7hw = &get_cpu_var(hv_24x7_hw); + h24x7hw->events[i] = event; + put_cpu_var(h24x7hw); + } + + put_cpu_var(hv_24x7_reqb); + } else { + now = h_24x7_get_value(event); + update_event_count(event, now); + } } static void h_24x7_event_start(struct perf_event *event, int flags) @@ -1255,6 +1303,117 @@ static int h_24x7_event_add(struct perf_event *event, int flags) return 0; } +/* + * 24x7 counters only support READ transactions. They are + * always counting and dont need/support ADD transactions. + * Cache the flags, but otherwise ignore transactions that + * are not PERF_PMU_TXN_READ. + */ +static void h_24x7_event_start_txn(struct pmu *pmu, unsigned int flags) +{ + struct hv_24x7_request_buffer *request_buffer; + struct hv_24x7_data_result_buffer *result_buffer; + + /* We should not be called if we are already in a txn */ + WARN_ON_ONCE(__this_cpu_read(hv_24x7_txn_flags)); + + __this_cpu_write(hv_24x7_txn_flags, flags); + if (flags & ~PERF_PMU_TXN_READ) + return; + + request_buffer = (void *)get_cpu_var(hv_24x7_reqb); + result_buffer = (void *)get_cpu_var(hv_24x7_resb); + + init_24x7_request(request_buffer, result_buffer); + + put_cpu_var(hv_24x7_resb); + put_cpu_var(hv_24x7_reqb); +} + +/* + * Clean up transaction state. + * + * NOTE: Ignore state of request and result buffers for now. 
+ * We will initialize them during the next read/txn. + */ +static void reset_txn(void) +{ + __this_cpu_write(hv_24x7_txn_flags, 0); + __this_cpu_write(hv_24x7_txn_err, 0); +} + +/* + * 24x7 counters only support READ transactions. They are always counting + * and dont need/support ADD transactions. Clear ->txn_flags but otherwise + * ignore transactions that are not of type PERF_PMU_TXN_READ. + * + * For READ transactions, submit all pending 24x7 requests (i.e requests + * that were queued by h_24x7_event_read()), to the hypervisor and update + * the event counts. + */ +static int h_24x7_event_commit_txn(struct pmu *pmu) +{ + struct hv_24x7_request_buffer *request_buffer; + struct hv_24x7_data_result_buffer *result_buffer; + struct hv_24x7_result *resb; + struct perf_event *event; + u64 count; + int i, ret, txn_flags; + struct hv_24x7_hw *h24x7hw; + + txn_flags = __this_cpu_read(hv_24x7_txn_flags); + WARN_ON_ONCE(!txn_flags); + + ret = 0; + if (txn_flags & ~PERF_PMU_TXN_READ) + goto out; + + ret = __this_cpu_read(hv_24x7_txn_err); + if (ret) + goto out; + + request_buffer = (void *)get_cpu_var(hv_24x7_reqb); + result_buffer = (void *)get_cpu_var(hv_24x7_resb); + + ret = make_24x7_request(request_buffer, result_buffer); + if (ret) { + log_24x7_hcall(request_buffer, result_buffer, ret); + goto put_reqb; + } + + h24x7hw = &get_cpu_var(hv_24x7_hw); + + /* Update event counts from hcall */ + for (i = 0; i < request_buffer->num_requests; i++) { + resb = &result_buffer->results[i]; + count = be64_to_cpu(resb->elements[0].element_data[0]); + event = h24x7hw->events[i]; + h24x7hw->events[i] = NULL; + update_event_count(event, count); + } + + put_cpu_var(hv_24x7_hw); + +put_reqb: + put_cpu_var(hv_24x7_resb); + put_cpu_var(hv_24x7_reqb); +out: + reset_txn(); + return ret; +} + +/* + * 24x7 counters only support READ transactions. They are always counting + * and dont need/support ADD transactions. However, regardless of type + * of transaction, all we need to do is cleanup, so we don't have to check + * the type of transaction. + */ +static void h_24x7_event_cancel_txn(struct pmu *pmu) +{ + WARN_ON_ONCE(!__this_cpu_read(hv_24x7_txn_flags)); + reset_txn(); +} + static struct pmu h_24x7_pmu = { .task_ctx_nr = perf_invalid_context, @@ -1266,6 +1425,9 @@ static struct pmu h_24x7_pmu = { .start = h_24x7_event_start, .stop = h_24x7_event_stop, .read = h_24x7_event_read, + .start_txn = h_24x7_event_start_txn, + .commit_txn = h_24x7_event_commit_txn, + .cancel_txn = h_24x7_event_cancel_txn, }; static int hv_24x7_init(void) -- cgit v1.2.3 From 8f3e5684d3fbd91ead283916676fa3dac22615e5 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 3 Sep 2015 20:07:53 -0700 Subject: perf/core: Drop PERF_EVENT_TXN We currently use PERF_EVENT_TXN flag to determine if we are in the middle of a transaction. If in a transaction, we defer the schedulability checks from pmu->add() operation to the pmu->commit() operation. Now that we have "transaction types" (PERF_PMU_TXN_ADD, PERF_PMU_TXN_READ) we can use the type to determine if we are in a transaction and drop the PERF_EVENT_TXN flag. When PERF_EVENT_TXN is dropped, the cpuhw->group_flag on some architectures becomes unused, so drop that field as well. This is an extension of the Powerpc patch from Peter Zijlstra to s390, Sparc and x86 architectures. 
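The per-architecture change is mechanical; schematically (an illustrative before/after condensed from the hunks below):

        /* before: transaction-in-progress was smuggled through group_flag */
        if (cpuc->group_flag & PERF_EVENT_TXN)
                goto nocheck;

        /* after: the type saved by ->start_txn() carries the same information */
        if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
                goto nocheck;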
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1441336073-22750-11-git-send-email-sukadev@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/powerpc/perf/core-book3s.c | 6 +----- arch/s390/kernel/perf_cpum_cf.c | 5 +---- arch/sparc/kernel/perf_event.c | 6 +----- arch/x86/kernel/cpu/perf_event.c | 7 ++----- arch/x86/kernel/cpu/perf_event.h | 1 - include/linux/perf_event.h | 2 -- 6 files changed, 5 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index c84074185c80..d1e65ce545b3 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -48,7 +48,6 @@ struct cpu_hw_events { unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; - unsigned int group_flag; unsigned int txn_flags; int n_txn_start; @@ -1442,7 +1441,7 @@ static int power_pmu_add(struct perf_event *event, int ef_flags) * skip the schedulability test here, it will be performed * at commit time(->commit_txn) as a whole */ - if (cpuhw->group_flag & PERF_EVENT_TXN) + if (cpuhw->txn_flags & PERF_PMU_TXN_ADD) goto nocheck; if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1)) @@ -1603,7 +1602,6 @@ static void power_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) return; perf_pmu_disable(pmu); - cpuhw->group_flag |= PERF_EVENT_TXN; cpuhw->n_txn_start = cpuhw->n_events; } @@ -1624,7 +1622,6 @@ static void power_pmu_cancel_txn(struct pmu *pmu) if (txn_flags & ~PERF_PMU_TXN_ADD) return; - cpuhw->group_flag &= ~PERF_EVENT_TXN; perf_pmu_enable(pmu); } @@ -1659,7 +1656,6 @@ static int power_pmu_commit_txn(struct pmu *pmu) for (i = cpuhw->n_txn_start; i < n; ++i) cpuhw->event[i]->hw.config = cpuhw->events[i]; - cpuhw->group_flag &= ~PERF_EVENT_TXN; cpuhw->txn_flags = 0; perf_pmu_enable(pmu); return 0; diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 19138be412d6..cb774ff6e749 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -536,7 +536,7 @@ static int cpumf_pmu_add(struct perf_event *event, int flags) * For group events transaction, the authorization check is * done in cpumf_pmu_commit_txn(). */ - if (!(cpuhw->flags & PERF_EVENT_TXN)) + if (!(cpuhw->txn_flags & PERF_PMU_TXN_ADD)) if (validate_ctr_auth(&event->hw)) return -EPERM; @@ -590,7 +590,6 @@ static void cpumf_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) return; perf_pmu_disable(pmu); - cpuhw->flags |= PERF_EVENT_TXN; cpuhw->tx_state = cpuhw->state; } @@ -613,7 +612,6 @@ static void cpumf_pmu_cancel_txn(struct pmu *pmu) WARN_ON(cpuhw->tx_state != cpuhw->state); - cpuhw->flags &= ~PERF_EVENT_TXN; perf_pmu_enable(pmu); } @@ -640,7 +638,6 @@ static int cpumf_pmu_commit_txn(struct pmu *pmu) if ((state & cpuhw->info.auth_ctl) != state) return -EPERM; - cpuhw->flags &= ~PERF_EVENT_TXN; cpuhw->txn_flags = 0; perf_pmu_enable(pmu); return 0; diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 2c0984d146ec..b0da5aedb336 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -108,7 +108,6 @@ struct cpu_hw_events { /* Enabled/disable state. 
*/ int enabled; - unsigned int group_flag; unsigned int txn_flags; }; static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; @@ -1380,7 +1379,7 @@ static int sparc_pmu_add(struct perf_event *event, int ef_flags) * skip the schedulability test here, it will be performed * at commit time(->commit_txn) as a whole */ - if (cpuc->group_flag & PERF_EVENT_TXN) + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto nocheck; if (check_excludes(cpuc->event, n0, 1)) @@ -1506,7 +1505,6 @@ static void sparc_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) return; perf_pmu_disable(pmu); - cpuhw->group_flag |= PERF_EVENT_TXN; } /* @@ -1526,7 +1524,6 @@ static void sparc_pmu_cancel_txn(struct pmu *pmu) if (txn_flags & ~PERF_PMU_TXN_ADD) return; - cpuhw->group_flag &= ~PERF_EVENT_TXN; perf_pmu_enable(pmu); } @@ -1556,7 +1553,6 @@ static int sparc_pmu_commit_txn(struct pmu *pmu) if (sparc_check_constraints(cpuc->event, cpuc->events, n)) return -EAGAIN; - cpuc->group_flag &= ~PERF_EVENT_TXN; cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index dc77ef470e0e..4562cf070c27 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1175,7 +1175,7 @@ static int x86_pmu_add(struct perf_event *event, int flags) * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole. */ - if (cpuc->group_flag & PERF_EVENT_TXN) + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto done_collect; ret = x86_pmu.schedule_events(cpuc, n, assign); @@ -1326,7 +1326,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) * XXX assumes any ->del() called during a TXN will only be on * an event added during that same TXN. */ - if (cpuc->group_flag & PERF_EVENT_TXN) + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) return; /* @@ -1764,7 +1764,6 @@ static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) return; perf_pmu_disable(pmu); - __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN); __this_cpu_write(cpu_hw_events.n_txn, 0); } @@ -1785,7 +1784,6 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) if (txn_flags & ~PERF_PMU_TXN_ADD) return; - __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); /* * Truncate collected array by the number of events added in this * transaction. See x86_pmu_add() and x86_pmu_*_txn(). 
@@ -1830,7 +1828,6 @@ static int x86_pmu_commit_txn(struct pmu *pmu) */ memcpy(cpuc->assign, assign, n*sizeof(int)); - cpuc->group_flag &= ~PERF_EVENT_TXN; cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index c99c93b9e348..953a0e4e3284 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -195,7 +195,6 @@ struct cpu_hw_events { int n_excl; /* the number of exclusive events */ - unsigned int group_flag; unsigned int txn_flags; int is_fake; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b83cea932f74..d841d33bcdc9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -199,8 +199,6 @@ struct perf_event; /* * Common implementation detail of pmu::{start,commit,cancel}_txn */ -#define PERF_EVENT_TXN 0x1 - #define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ #define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ -- cgit v1.2.3 From 96f3eda67fcf2598e9d2794398e0e7ab35138ea6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 14 Sep 2015 10:14:07 -0400 Subject: perf/x86/intel: Fix static checker warning in lbr enable Commit deb27519bf1f ("perf/x86/intel: Fix LBR callstack issue caused by FREEZE_LBRS_ON_PMI") leads to the following Smatch complaint: warn: variable dereferenced before check 'cpuc->lbr_sel' (see line 154) Fix the warning. Reported-by: Dan Carpenter Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: deb27519bf1f ("perf/x86/intel: Fix LBR callstack issue caused by FREEZE_LBRS_ON_PMI") Link: http://lkml.kernel.org/r/1442240047-48149-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index a1d07c71d3b6..ad0b8b0490a0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -151,8 +151,9 @@ static void __intel_pmu_lbr_enable(bool pmi) * No need to reprogram LBR_SELECT in a PMI, as it * did not change. */ - lbr_select = cpuc->lbr_sel->config; - if (cpuc->lbr_sel && !pmi) + if (cpuc->lbr_sel) + lbr_select = cpuc->lbr_sel->config; + if (!pmi) wrmsrl(MSR_LBR_SELECT, lbr_select); rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); -- cgit v1.2.3 From 18ab2cd3ee9d52dc64c5ae984146a261a328c4e8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 27 Sep 2015 23:25:50 +0800 Subject: perf/core, perf/x86: Change needlessly global functions and a variable to static Fixes various sparse warnings. 
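For context, the warnings being fixed are sparse's missing-declaration class; for a symbol such as update_perf_cpu_limits() the report looks roughly like this (illustrative output, not a verbatim build log):

	kernel/events/core.c:199:6: warning: symbol 'update_perf_cpu_limits' was not declared. Should it be static?

Marking each definition static both silences the warning and documents that the symbol has no users outside its translation unit.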
Signed-off-by: Geliang Tang Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/70c14234da1bed6e3e67b9c419e2d5e376ab4f32.1443367286.git.geliangtang@163.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 8 ++++---- kernel/events/core.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index be4febc58b94..e38d338a6447 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -157,7 +157,7 @@ struct _cpuid4_info_regs { struct amd_northbridge *nb; }; -unsigned short num_cache_leaves; +static unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same information to the user. This makes some assumptions about the machine: @@ -326,7 +326,7 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb) * * @returns: the disabled index if used or negative value if slot free. */ -int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) +static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) { unsigned int reg = 0; @@ -403,8 +403,8 @@ static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, * * @return: 0 on success, error status on failure */ -int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, - unsigned long index) +static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, + unsigned slot, unsigned long index) { int ret = 0; diff --git a/kernel/events/core.c b/kernel/events/core.c index f87b434c3c1e..ea02109aee77 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -196,7 +196,7 @@ static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; static int perf_sample_allowed_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; -void update_perf_cpu_limits(void) +static void update_perf_cpu_limits(void) { u64 tmp = perf_sample_period_ns; @@ -472,7 +472,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, * mode SWOUT : schedule out everything * mode SWIN : schedule in based on cgroup for next */ -void perf_cgroup_switch(struct task_struct *task, int mode) +static void perf_cgroup_switch(struct task_struct *task, int mode) { struct perf_cpu_context *cpuctx; struct pmu *pmu; @@ -7390,7 +7390,7 @@ static int perf_pmu_nop_int(struct pmu *pmu) return 0; } -DEFINE_PER_CPU(unsigned int, nop_txn_flags); +static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) { @@ -7750,7 +7750,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) return ret; } -struct pmu *perf_init_event(struct perf_event *event) +static struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; int idx; -- cgit v1.2.3 From c2365b9388e8ec19305e3f449c1826e7493d156d Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Thu, 1 Oct 2015 14:22:02 +0530 Subject: perf/x86/intel/uncore: Do not use macro DEFINE_PCI_DEVICE_TABLE() The DEFINE_PCI_DEVICE_TABLE() macro is deprecated. Use 'struct pci_device_id' instead of DEFINE_PCI_DEVICE_TABLE(), with the goal of getting rid of this macro completely. 
This Coccinelle semantic patch performs this transformation: @@ identifier a; declarer name DEFINE_PCI_DEVICE_TABLE; initializer i; @@ - DEFINE_PCI_DEVICE_TABLE(a) + const struct pci_device_id a[] = i; Signed-off-by: Vaishali Thakkar Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20151001085201.GA16939@localhost Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 694510a887dc..8cbb3f386b07 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -2444,7 +2444,7 @@ static struct intel_uncore_type *bdx_pci_uncores[] = { NULL, }; -static DEFINE_PCI_DEVICE_TABLE(bdx_uncore_pci_ids) = { +static const struct pci_device_id bdx_uncore_pci_ids[] = { { /* Home Agent 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30), .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0), -- cgit v1.2.3 From 7ce1346a6842550a3c4c453cdf1c7b81fb60b07e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 28 Sep 2015 08:30:04 -0400 Subject: perf/x86: Add Intel cstate PMUs support This patch adds new PMUs to support cstate-related free-running (read-only) counters. These counters may be used simultaneously by other tools, such as turbostat. However, it still makes sense to implement them in perf, because we can conveniently collect them together with other events, and they can be used from tools without special MSR access code. These counters include CORE_C*_RESIDENCY and PKG_C*_RESIDENCY. According to counters' scope and category, two PMUs are registered with the perf_event core subsystem. - 'cstate_core': The counter is available for each physical core. The counters include CORE_C*_RESIDENCY. - 'cstate_pkg': The counter is available for each physical package. The counters include PKG_C*_RESIDENCY. The events are exposed in sysfs for use by perf stat and other tools. The files are: /sys/devices/cstate_core/events/c*-residency /sys/devices/cstate_pkg/events/c*-residency These events only support system-wide mode counting. The /sys/devices/cstate_*/cpumask file can be used by tools to figure out which CPUs to monitor by default. The PMU type (attr->type) is dynamically allocated and is available from /sys/devices/core_misc/type and /sys/devices/cstate_*/type. Sampling is not supported. Here is an example. - To calculate the fraction of time when the core is running in C6 state CORE_C6_time% = CORE_C6_RESIDENCY / TSC # perf stat -x, -e"cstate_core/c6-residency/,msr/tsc/" -C0 -- taskset -c 0 sleep 5 11838820015,,cstate_core/c6-residency/,5175919658,100.00 11877130740,,msr/tsc/,5175922010,100.00 For sleep, 99.7% of time we ran in C6 state. # perf stat -x, -e"cstate_core/c6-residency/,msr/tsc/" -C0 -- taskset -c 0 busyloop 1253316,,cstate_core/c6-residency/,4360969154,100.00 10012635248,,msr/tsc/,4360972366,100.00 For busyloop, 0.01% of time we ran in C6 state.
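For clarity, the percentages quoted above are simply the ratio of the residency column to the TSC column in the CSV output:

	C6% (sleep)    = 11838820015 / 11877130740 ≈ 99.7%
	C6% (busyloop) =     1253316 / 10012635248 ≈  0.01%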
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arjan van de Ven Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: eranian@google.com Link: http://lkml.kernel.org/r/1443443404-8581-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/perf_event_intel_cstate.c | 694 ++++++++++++++++++++++++++ 2 files changed, 695 insertions(+) create mode 100644 arch/x86/kernel/cpu/perf_event_intel_cstate.c (limited to 'arch') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4eb065c6bed2..58031303e304 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/arch/x86/kernel/cpu/perf_event_intel_cstate.c new file mode 100644 index 000000000000..75a38b5a2e26 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_cstate.c @@ -0,0 +1,694 @@ +/* + * perf_event_intel_cstate.c: support cstate residency counters + * + * Copyright (C) 2015, Intel Corp. + * Author: Kan Liang (kan.liang@intel.com) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + */ + +/* + * This file exports cstate-related free-running (read-only) counters + * for perf. These counters may be used simultaneously by other tools, + * such as turbostat. However, it still makes sense to implement them + * in perf, because we can conveniently collect them together with + * other events, and they can be used from tools without special MSR + * access code. + * + * The events only support system-wide mode counting. There is no + * sampling support because it is not supported by the hardware. + * + * According to counters' scope and category, two PMUs are registered + * with the perf_event core subsystem. + * - 'cstate_core': The counter is available for each physical core. + * The counters include CORE_C*_RESIDENCY. + * - 'cstate_pkg': The counter is available for each physical package. + * The counters include PKG_C*_RESIDENCY. + * + * All of these counters are specified in the Intel® 64 and IA-32 + * Architectures Software Developer's Manual Vol3b.
+ * + * Model specific counters: + * MSR_CORE_C1_RES: CORE C1 Residency Counter + * perf code: 0x00 + * Available model: SLM,AMT + * Scope: Core (each processor core has a MSR) + * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter + * perf code: 0x01 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter + * perf code: 0x02 + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter + * perf code: 0x03 + * Available model: SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. + * perf code: 0x00 + * Available model: SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. + * perf code: 0x01 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. + * perf code: 0x02 + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. + * perf code: 0x03 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. + * perf code: 0x04 + * Available model: HSW ULT only + * Scope: Package (physical package) + * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. + * perf code: 0x05 + * Available model: HSW ULT only + * Scope: Package (physical package) + * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. + * perf code: 0x06 + * Available model: HSW ULT only + * Scope: Package (physical package) + * + */ + +#include +#include +#include +#include +#include "perf_event.h" + +#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __cstate_##_var##_show, NULL) + +static ssize_t cstate_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf); + +struct perf_cstate_msr { + u64 msr; + struct perf_pmu_events_attr *attr; + bool (*test)(int idx); +}; + + +/* cstate_core PMU */ + +static struct pmu cstate_core_pmu; +static bool has_cstate_core; + +enum perf_cstate_core_id { + /* + * cstate_core events + */ + PERF_CSTATE_CORE_C1_RES = 0, + PERF_CSTATE_CORE_C3_RES, + PERF_CSTATE_CORE_C6_RES, + PERF_CSTATE_CORE_C7_RES, + + PERF_CSTATE_CORE_EVENT_MAX, +}; + +bool test_core(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + 
GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES || + idx == PERF_CSTATE_CORE_C7_RES) + return true; + break; + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_CSTATE_CORE_C1_RES || + idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + } + + return false; +} + +PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); + +static struct perf_cstate_msr core_msr[] = { + [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, }, + [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, }, + [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, }, + [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, }, +}; + +static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group core_events_attr_group = { + .name = "events", + .attrs = core_events_attrs, +}; + +DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63"); +static struct attribute *core_format_attrs[] = { + &format_attr_core_event.attr, + NULL, +}; + +static struct attribute_group core_format_attr_group = { + .name = "format", + .attrs = core_format_attrs, +}; + +static cpumask_t cstate_core_cpu_mask; +static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL); + +static struct attribute *cstate_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group cpumask_attr_group = { + .attrs = cstate_cpumask_attrs, +}; + +static const struct attribute_group *core_attr_groups[] = { + &core_events_attr_group, + &core_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +/* cstate_core PMU end */ + + +/* cstate_pkg PMU */ + +static struct pmu cstate_pkg_pmu; +static bool has_cstate_pkg; + +enum perf_cstate_pkg_id { + /* + * cstate_pkg events + */ + PERF_CSTATE_PKG_C2_RES = 0, + PERF_CSTATE_PKG_C3_RES, + PERF_CSTATE_PKG_C6_RES, + PERF_CSTATE_PKG_C7_RES, + PERF_CSTATE_PKG_C8_RES, + PERF_CSTATE_PKG_C9_RES, + PERF_CSTATE_PKG_C10_RES, + + PERF_CSTATE_PKG_EVENT_MAX, +}; + +bool test_pkg(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES || + idx == PERF_CSTATE_CORE_C7_RES) + return true; + break; + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server 
*/ + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_CSTATE_PKG_C2_RES || + idx == PERF_CSTATE_PKG_C3_RES || + idx == PERF_CSTATE_PKG_C6_RES || + idx == PERF_CSTATE_PKG_C7_RES) + return true; + break; + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + case 69: /* 22nm Haswell ULT */ + if (idx == PERF_CSTATE_PKG_C2_RES || + idx == PERF_CSTATE_PKG_C3_RES || + idx == PERF_CSTATE_PKG_C6_RES || + idx == PERF_CSTATE_PKG_C7_RES || + idx == PERF_CSTATE_PKG_C8_RES || + idx == PERF_CSTATE_PKG_C9_RES || + idx == PERF_CSTATE_PKG_C10_RES) + return true; + break; + } + + return false; +} + +PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03"); +PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04"); +PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); +PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06"); + +static struct perf_cstate_msr pkg_msr[] = { + [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, }, + [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, }, + [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6, test_pkg, }, + [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, }, + [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, }, + [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, }, + [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, }, +}; + +static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group pkg_events_attr_group = { + .name = "events", + .attrs = pkg_events_attrs, +}; + +DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63"); +static struct attribute *pkg_format_attrs[] = { + &format_attr_pkg_event.attr, + NULL, +}; +static struct attribute_group pkg_format_attr_group = { + .name = "format", + .attrs = pkg_format_attrs, +}; + +static cpumask_t cstate_pkg_cpu_mask; + +static const struct attribute_group *pkg_attr_groups[] = { + &pkg_events_attr_group, + &pkg_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +/* cstate_pkg PMU end*/ + +static ssize_t cstate_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + if (pmu == &cstate_core_pmu) + return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask); + else if (pmu == &cstate_pkg_pmu) + return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask); + else + return 0; +} + +static int cstate_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config; + int ret = 0; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + if (event->pmu == &cstate_core_pmu) { + if (cfg >= 
PERF_CSTATE_CORE_EVENT_MAX) + return -EINVAL; + if (!core_msr[cfg].attr) + return -EINVAL; + event->hw.event_base = core_msr[cfg].msr; + } else if (event->pmu == &cstate_pkg_pmu) { + if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) + return -EINVAL; + if (!pkg_msr[cfg].attr) + return -EINVAL; + event->hw.event_base = pkg_msr[cfg].msr; + } else + return -ENOENT; + + /* must be done before validate_group */ + event->hw.config = cfg; + event->hw.idx = -1; + + return ret; +} + +static inline u64 cstate_pmu_read_counter(struct perf_event *event) +{ + u64 val; + + rdmsrl(event->hw.event_base, val); + return val; +} + +static void cstate_pmu_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = cstate_pmu_read_counter(event); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + local64_add(new_raw_count - prev_raw_count, &event->count); +} + +static void cstate_pmu_event_start(struct perf_event *event, int mode) +{ + local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event)); +} + +static void cstate_pmu_event_stop(struct perf_event *event, int mode) +{ + cstate_pmu_event_update(event); +} + +static void cstate_pmu_event_del(struct perf_event *event, int mode) +{ + cstate_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int cstate_pmu_event_add(struct perf_event *event, int mode) +{ + if (mode & PERF_EF_START) + cstate_pmu_event_start(event, mode); + + return 0; +} + +static void cstate_cpu_exit(int cpu) +{ + int i, id, target; + + /* cpu exit for cstate core */ + if (has_cstate_core) { + id = topology_core_id(cpu); + target = -1; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (id == topology_core_id(i)) { + target = i; + break; + } + } + if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &cstate_core_cpu_mask); + WARN_ON(cpumask_empty(&cstate_core_cpu_mask)); + if (target >= 0) + perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); + } + + /* cpu exit for cstate pkg */ + if (has_cstate_pkg) { + id = topology_physical_package_id(cpu); + target = -1; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (id == topology_physical_package_id(i)) { + target = i; + break; + } + } + if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &cstate_pkg_cpu_mask); + WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask)); + if (target >= 0) + perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); + } +} + +static void cstate_cpu_init(int cpu) +{ + int i, id; + + /* cpu init for cstate core */ + if (has_cstate_core) { + id = topology_core_id(cpu); + for_each_cpu(i, &cstate_core_cpu_mask) { + if (id == topology_core_id(i)) + break; + } + if (i >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_core_cpu_mask); + } + + /* cpu init for cstate pkg */ + if (has_cstate_pkg) { + id = topology_physical_package_id(cpu); + for_each_cpu(i, &cstate_pkg_cpu_mask) { + if (id == topology_physical_package_id(i)) + break; + } + if (i >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); + } +} + +static int cstate_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + break; + case CPU_STARTING: + cstate_cpu_init(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + break; + case 
CPU_ONLINE: + case CPU_DEAD: + break; + case CPU_DOWN_PREPARE: + cstate_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +/* + * Probe the cstate events and insert the available one into sysfs attrs + * Return false if there is no available events. + */ +static bool cstate_probe_msr(struct perf_cstate_msr *msr, + struct attribute **events_attrs, + int max_event_nr) +{ + int i, j = 0; + u64 val; + + /* Probe the cstate events. */ + for (i = 0; i < max_event_nr; i++) { + if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) + msr[i].attr = NULL; + } + + /* List remaining events in the sysfs attrs. */ + for (i = 0; i < max_event_nr; i++) { + if (msr[i].attr) + events_attrs[j++] = &msr[i].attr->attr.attr; + } + events_attrs[j] = NULL; + + return (j > 0) ? true : false; +} + +static int __init cstate_init(void) +{ + /* SLM has different MSR for PKG C6 */ + switch (boot_cpu_data.x86_model) { + case 55: + case 76: + case 77: + pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; + } + + if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX)) + has_cstate_core = true; + + if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX)) + has_cstate_pkg = true; + + return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; +} + +static void __init cstate_cpumask_init(void) +{ + int cpu; + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) + cstate_cpu_init(cpu); + + __perf_cpu_notifier(cstate_cpu_notifier); + + cpu_notifier_register_done(); +} + +static struct pmu cstate_core_pmu = { + .attr_groups = core_attr_groups, + .name = "cstate_core", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, /* must have */ + .del = cstate_pmu_event_del, /* must have */ + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static struct pmu cstate_pkg_pmu = { + .attr_groups = pkg_attr_groups, + .name = "cstate_pkg", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, /* must have */ + .del = cstate_pmu_event_del, /* must have */ + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static void __init cstate_pmus_register(void) +{ + int err; + + if (has_cstate_core) { + err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); + if (WARN_ON(err)) + pr_info("Failed to register PMU %s error %d\n", + cstate_core_pmu.name, err); + } + + if (has_cstate_pkg) { + err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); + if (WARN_ON(err)) + pr_info("Failed to register PMU %s error %d\n", + cstate_pkg_pmu.name, err); + } +} + +static int __init cstate_pmu_init(void) +{ + int err; + + if (cpu_has_hypervisor) + return -ENODEV; + + err = cstate_init(); + if (err) + return err; + + cstate_cpumask_init(); + + cstate_pmus_register(); + + return 0; +} + +device_initcall(cstate_pmu_init); -- cgit v1.2.3 From 712df65ccb63da08a484bf57c40b250dfd4103a7 Mon Sep 17 00:00:00 2001 From: Taku Izumi Date: Thu, 24 Sep 2015 21:10:21 +0900 Subject: perf/x86/intel/uncore: Fix multi-segment problem of perf_event_intel_uncore In multi-segment system, uncore devices may belong to buses whose segment number is other than 0: .... 0000:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers (rev 03) ... 
0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers (rev 03) ... 0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers (rev 03) ... 0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers (rev 03 ... In that case, relation of bus number and physical id may be broken because "uncore_pcibus_to_physid" doesn't take account of PCI segment. For example, bus 0000:ff and 0001:ff uses the same entry of "uncore_pcibus_to_physid" array. This patch fixes this problem by introducing the segment-aware pci2phy_map instead. Signed-off-by: Taku Izumi Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1443096621-4119-1-git-send-email-izumi.taku@jp.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 61 ++++++++++++++++++++-- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 12 ++++- arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 16 ++++-- .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 32 +++++++++--- 4 files changed, 106 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 560e5255b15e..61215a69b03d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore; static bool pcidrv_registered; struct pci_driver *uncore_pci_driver; /* pci bus to socket mapping */ -int uncore_pcibus_to_physid[256] = { [0 ... 
255] = -1, }; +DEFINE_RAW_SPINLOCK(pci2phy_map_lock); +struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; static DEFINE_RAW_SPINLOCK(uncore_box_lock); @@ -20,6 +21,59 @@ static struct event_constraint uncore_constraint_fixed = struct event_constraint uncore_constraint_empty = EVENT_CONSTRAINT(0, 0, 0); +int uncore_pcibus_to_physid(struct pci_bus *bus) +{ + struct pci2phy_map *map; + int phys_id = -1; + + raw_spin_lock(&pci2phy_map_lock); + list_for_each_entry(map, &pci2phy_map_head, list) { + if (map->segment == pci_domain_nr(bus)) { + phys_id = map->pbus_to_physid[bus->number]; + break; + } + } + raw_spin_unlock(&pci2phy_map_lock); + + return phys_id; +} + +struct pci2phy_map *__find_pci2phy_map(int segment) +{ + struct pci2phy_map *map, *alloc = NULL; + int i; + + lockdep_assert_held(&pci2phy_map_lock); + +lookup: + list_for_each_entry(map, &pci2phy_map_head, list) { + if (map->segment == segment) + goto end; + } + + if (!alloc) { + raw_spin_unlock(&pci2phy_map_lock); + alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL); + raw_spin_lock(&pci2phy_map_lock); + + if (!alloc) + return NULL; + + goto lookup; + } + + map = alloc; + alloc = NULL; + map->segment = segment; + for (i = 0; i < 256; i++) + map->pbus_to_physid[i] = -1; + list_add_tail(&map->list, &pci2phy_map_head); + +end: + kfree(alloc); + return map; +} + ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -809,7 +863,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id int phys_id; bool first_box = false; - phys_id = uncore_pcibus_to_physid[pdev->bus->number]; + phys_id = uncore_pcibus_to_physid(pdev->bus); if (phys_id < 0) return -ENODEV; @@ -856,9 +910,10 @@ static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box = pci_get_drvdata(pdev); struct intel_uncore_pmu *pmu; - int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number]; + int i, cpu, phys_id; bool last_box = false; + phys_id = uncore_pcibus_to_physid(pdev->bus); box = pci_get_drvdata(pdev); if (!box) { for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 72c54c2e5b1a..2f0a4a98e16b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -117,6 +117,15 @@ struct uncore_event_desc { const char *config; }; +struct pci2phy_map { + struct list_head list; + int segment; + int pbus_to_physid[256]; +}; + +int uncore_pcibus_to_physid(struct pci_bus *bus); +struct pci2phy_map *__find_pci2phy_map(int segment); + ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); @@ -317,7 +326,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); extern struct intel_uncore_type **uncore_msr_uncores; extern struct intel_uncore_type **uncore_pci_uncores; extern struct pci_driver *uncore_pci_driver; -extern int uncore_pcibus_to_physid[256]; +extern raw_spinlock_t pci2phy_map_lock; +extern struct list_head pci2phy_map_head; extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; extern struct event_constraint uncore_constraint_empty; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index f78574b3cb55..845256158a10 100644 --- 
a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -420,15 +420,25 @@ static void snb_uncore_imc_event_del(struct perf_event *event, int flags) static int snb_pci2phy_map_init(int devid) { struct pci_dev *dev = NULL; - int bus; + struct pci2phy_map *map; + int bus, segment; dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); if (!dev) return -ENOTTY; bus = dev->bus->number; - - uncore_pcibus_to_physid[bus] = 0; + segment = pci_domain_nr(dev->bus); + + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + pci_dev_put(dev); + return -ENOMEM; + } + map->pbus_to_physid[bus] = 0; + raw_spin_unlock(&pci2phy_map_lock); pci_dev_put(dev); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 8cbb3f386b07..f0f4fcba252e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -1087,7 +1087,8 @@ static struct pci_driver snbep_uncore_pci_driver = { static int snbep_pci2phy_map_init(int devid) { struct pci_dev *ubox_dev = NULL; - int i, bus, nodeid; + int i, bus, nodeid, segment; + struct pci2phy_map *map; int err = 0; u32 config = 0; @@ -1106,16 +1107,27 @@ static int snbep_pci2phy_map_init(int devid) err = pci_read_config_dword(ubox_dev, 0x54, &config); if (err) break; + + segment = pci_domain_nr(ubox_dev->bus); + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + err = -ENOMEM; + break; + } + /* * every three bits in the Node ID mapping register maps * to a particular node. */ for (i = 0; i < 8; i++) { if (nodeid == ((config >> (3 * i)) & 0x7)) { - uncore_pcibus_to_physid[bus] = i; + map->pbus_to_physid[bus] = i; break; } } + raw_spin_unlock(&pci2phy_map_lock); } if (!err) { @@ -1123,13 +1135,17 @@ static int snbep_pci2phy_map_init(int devid) * For PCI bus with no UBOX device, find the next bus * that has UBOX device and use its mapping. */ - i = -1; - for (bus = 255; bus >= 0; bus--) { - if (uncore_pcibus_to_physid[bus] >= 0) - i = uncore_pcibus_to_physid[bus]; - else - uncore_pcibus_to_physid[bus] = i; + raw_spin_lock(&pci2phy_map_lock); + list_for_each_entry(map, &pci2phy_map_head, list) { + i = -1; + for (bus = 255; bus >= 0; bus--) { + if (map->pbus_to_physid[bus] >= 0) + i = map->pbus_to_physid[bus]; + else + map->pbus_to_physid[bus] = i; + } } + raw_spin_unlock(&pci2phy_map_lock); } pci_dev_put(ubox_dev); -- cgit v1.2.3 From b9511cd761faafca7a1acc059e792c1399f9d7c6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 16 Oct 2015 16:24:05 +0300 Subject: perf/x86: Fix time_shift in perf_event_mmap_page Commit: b20112edeadf ("perf/x86: Improve accuracy of perf/sched clock") allowed the time_shift value in perf_event_mmap_page to be as much as 32. Unfortunately the documented algorithms for using time_shift have it shifting an integer, whereas to work correctly with the value 32, the type must be u64. In the case of perf tools, Intel PT decodes correctly but the timestamps that are output (for example by perf script) have lost 32-bits of granularity so they look like they are not changing at all. Fix by limiting the shift to 31 and adjusting the multiplier accordingly. Also update the documentation of perf_event_mmap_page so that new code based on it will be more future-proof. 
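To see why the cast matters, here is the documented conversion written out as a small helper. This is an illustrative sketch assembled from the perf_event_mmap_page comments quoted in the diff below, not code taken from the patch itself:

	/* Convert a raw cycle count to time using the mmap page fields. */
	static u64 cyc_to_time(u64 cyc, u64 time_offset, u32 time_mult, u16 time_shift)
	{
		u64 quot = cyc >> time_shift;
		/*
		 * Without the u64 cast, "1 << time_shift" shifts a 32-bit int;
		 * for time_shift == 32 that is undefined behaviour in C, and in
		 * practice the mask loses the low 32 bits of granularity.
		 */
		u64 rem = cyc & (((u64)1 << time_shift) - 1);

		return time_offset + quot * time_mult +
		       ((rem * time_mult) >> time_shift);
	}

With the kernel-side clamp to 31, even the old 32-bit expression stays in range; the documentation change makes user-space implementations robust either way.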
Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: b20112edeadf ("perf/x86: Improve accuracy of perf/sched clock") Link: http://lkml.kernel.org/r/1445001845-13688-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 11 +++++++++++ include/uapi/linux/perf_event.h | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 69b84a26ea17..c7c4d9c51e99 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -259,6 +259,17 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz, NSEC_PER_MSEC, 0); + /* + * cyc2ns_shift is exported via arch_perf_update_userpage() where it is + * not expected to be greater than 31 due to the original published + * conversion algorithm shifting a 32-bit value (now specifies a 64-bit + * value) - refer perf_event_mmap_page documentation in perf_event.h. + */ + if (data->cyc2ns_shift == 32) { + data->cyc2ns_shift = 31; + data->cyc2ns_mul >>= 1; + } + data->cyc2ns_offset = ns_now - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift); diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 2881145cda86..6c72e72e975c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -476,7 +476,7 @@ struct perf_event_mmap_page { * u64 delta; * * quot = (cyc >> time_shift); - * rem = cyc & ((1 << time_shift) - 1); + * rem = cyc & (((u64)1 << time_shift) - 1); * delta = time_offset + quot * time_mult + * ((rem * time_mult) >> time_shift); * @@ -507,7 +507,7 @@ struct perf_event_mmap_page { * And vice versa: * * quot = cyc >> time_shift; - * rem = cyc & ((1 << time_shift) - 1); + * rem = cyc & (((u64)1 << time_shift) - 1); * timestamp = time_zero + quot * time_mult + * ((rem * time_mult) >> time_shift); */ -- cgit v1.2.3 From d892819faa6860d469aae71d70c336b391c25505 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 13 Oct 2015 09:09:09 +0200 Subject: perf/x86: Add support for PERF_SAMPLE_BRANCH_CALL This patch enables support for PERF_SAMPLE_BRANCH_CALL on Intel x86 processors. When the processor supports LBR filtering, the selection is done in hardware. Otherwise, the filter is applied by software. Note that we chose to include zero-length calls because they also represent calls.
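As a usage illustration (a hypothetical user-space fragment, not part of this patch), the new filter is requested through perf_event_attr like any other branch_sample_type bit; privilege-level bits may be given explicitly or inherited from the event's exclude_* settings:

	struct perf_event_attr attr = {
		.size               = sizeof(attr),
		.type               = PERF_TYPE_HARDWARE,
		.config             = PERF_COUNT_HW_CPU_CYCLES,
		.sample_period      = 100000,
		.sample_type        = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK,
		/* sample direct calls only, user level only */
		.branch_sample_type = PERF_SAMPLE_BRANCH_CALL |
				      PERF_SAMPLE_BRANCH_USER,
	};

On hardware whose LBR select MSR supports it, the request maps to LBR_REL_CALL; otherwise intel_pmu_setup_sw_lbr_filter() applies the X86_BR_CALL | X86_BR_ZERO_CALL mask in software, as the diff below shows.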
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: khandual@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1444720151-10275-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index ad0b8b0490a0..bfd0b717e944 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -555,6 +555,8 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) mask |= X86_BR_IND_JMP; + if (br_type & PERF_SAMPLE_BRANCH_CALL) + mask |= X86_BR_CALL | X86_BR_ZERO_CALL; /* * stash actual user request into reg, it may * be used by fixup code for some CPU @@ -890,6 +892,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { @@ -905,6 +908,7 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | LBR_CALL_STACK, [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; /* core */ -- cgit v1.2.3 From 24f1a79a5fc10858e05ee0bf651ec99abfc0319b Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 13 Oct 2015 09:09:10 +0200 Subject: perf/powerpc: Add support for PERF_SAMPLE_BRANCH_CALL The patch catches PERF_SAMPLE_BRANCH_CALL because it is not clear whether this is actually supported by the hardware. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: khandual@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1444720151-10275-4-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/powerpc/perf/power8-pmu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index 396351db601b..7d5e295255b7 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -676,6 +676,9 @@ static u64 power8_bhrb_filter_map(u64 branch_sample_type) if (branch_sample_type & PERF_SAMPLE_BRANCH_IND_CALL) return -1; + if (branch_sample_type & PERF_SAMPLE_BRANCH_CALL) + return -1; + if (branch_sample_type & PERF_SAMPLE_BRANCH_ANY_CALL) { pmu_bhrb_filter |= POWER8_MMCRA_IFM1; return pmu_bhrb_filter; -- cgit v1.2.3
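For completeness, a note on what the rejection means in practice. Based on the generic powerpc event-init path (paraphrased from memory, not quoted from this patch), bhrb_filter_map() returning -1 makes the event fail to open, so requesting PERF_SAMPLE_BRANCH_CALL on POWER8 now errors out cleanly rather than recording unfiltered branches:

	/* Paraphrased sketch of power_pmu_event_init()'s branch-stack check. */
	if (has_branch_stack(event)) {
		bhrb_filter = ppmu->bhrb_filter_map(event->attr.branch_sample_type);
		if (bhrb_filter == -1)
			return -EOPNOTSUPP;	/* PERF_SAMPLE_BRANCH_CALL lands here */
	}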