Source file src/cmd/compile/internal/amd64/ssa.go

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package amd64
     6  
     7  import (
     8  	"fmt"
     9  	"math"
    10  
    11  	"cmd/compile/internal/base"
    12  	"cmd/compile/internal/ir"
    13  	"cmd/compile/internal/logopt"
    14  	"cmd/compile/internal/objw"
    15  	"cmd/compile/internal/ssa"
    16  	"cmd/compile/internal/ssagen"
    17  	"cmd/compile/internal/types"
    18  	"cmd/internal/obj"
    19  	"cmd/internal/obj/x86"
    20  	"internal/abi"
    21  )
    22  
    23  // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
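        // For example (a sketch, with illustrative registers): in
        //
        //	CMPQ AX, BX
        //	MOVQ $0, CX
        //	CMOVQEQ DX, CX
        //
        // the MOVQ must keep its MOV encoding; rewriting it into XORL CX, CX
        // would clobber the flags set by CMPQ before CMOVQEQ reads them.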
    24  func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {
    25  	flive := b.FlagsLiveAtEnd
    26  	for _, c := range b.ControlValues() {
    27  		flive = c.Type.IsFlags() || flive
    28  	}
    29  	for i := len(b.Values) - 1; i >= 0; i-- {
    30  		v := b.Values[i]
    31  		if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) {
    32  			// The "mark" is any non-nil Aux value.
    33  			v.Aux = ssa.AuxMark
    34  		}
    35  		if v.Type.IsFlags() {
    36  			flive = false
    37  		}
    38  		for _, a := range v.Args {
    39  			if a.Type.IsFlags() {
    40  				flive = true
    41  			}
    42  		}
    43  	}
    44  }
    45  
    46  func isFPReg(r int16) bool {
    47  	return x86.REG_X0 <= r && r <= x86.REG_Z31
    48  }
    49  
    50  func isKReg(r int16) bool {
    51  	return x86.REG_K0 <= r && r <= x86.REG_K7
    52  }
    53  
    54  func isLowFPReg(r int16) bool {
    55  	return x86.REG_X0 <= r && r <= x86.REG_X15
    56  }
    57  
    58  // loadByRegWidth returns the load instruction for the given register and width.
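        // For example, loadByRegWidth(x86.REG_AX, 1) is MOVBLZX (a zero-extending
        // load, avoiding a partial register write), while loadByRegWidth(x86.REG_X0, 8)
        // is MOVSD, the same as the corresponding store opcode.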
    59  func loadByRegWidth(r int16, width int64) obj.As {
    60  	// Avoid partial register writes for GPRs.
    61  	if !isFPReg(r) && !isKReg(r) {
    62  		switch width {
    63  		case 1:
    64  			return x86.AMOVBLZX
    65  		case 2:
    66  			return x86.AMOVWLZX
    67  		}
    68  	}
    69  	// Otherwise, there's no difference between load and store opcodes.
    70  	return storeByRegWidth(r, width)
    71  }
    72  
    73  // storeByRegWidth returns the store instruction for the given register and width.
    74  // It is also used for loading a constant into a register.
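        // For example, storeByRegWidth(x86.REG_AX, 8) is MOVQ and
        // storeByRegWidth(x86.REG_X0, 16) is MOVUPS.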
    75  func storeByRegWidth(r int16, width int64) obj.As {
    76  	if isFPReg(r) {
    77  		switch width {
    78  		case 4:
    79  			return x86.AMOVSS
    80  		case 8:
    81  			return x86.AMOVSD
    82  		case 16:
    83  			// int128s are in SSE registers
    84  			if isLowFPReg(r) {
    85  				return x86.AMOVUPS
    86  			} else {
    87  				return x86.AVMOVDQU
    88  			}
    89  		case 32:
    90  			return x86.AVMOVDQU
    91  		case 64:
    92  			return x86.AVMOVDQU64
    93  		}
    94  	}
    95  	if isKReg(r) {
    96  		return x86.AKMOVQ
    97  	}
    98  	// gp
    99  	switch width {
   100  	case 1:
   101  		return x86.AMOVB
   102  	case 2:
   103  		return x86.AMOVW
   104  	case 4:
   105  		return x86.AMOVL
   106  	case 8:
   107  		return x86.AMOVQ
   108  	}
   109  	panic(fmt.Sprintf("bad store reg=%v, width=%d", r, width))
   110  }
   111  
   112  // moveByRegsWidth returns the reg->reg move instruction for the given dest/src registers and width.
   113  func moveByRegsWidth(dest, src int16, width int64) obj.As {
   114  	// fp -> fp
   115  	if isFPReg(dest) && isFPReg(src) {
   116  		// Moving the whole SSE2 register is faster
   117  		// than moving just the correct low portion of it.
   118  		// There is no xmm->xmm move with a 1-byte opcode,
   119  		// so use MOVUPS, which has a 2-byte opcode.
   120  		if isLowFPReg(dest) && isLowFPReg(src) && width <= 16 {
   121  			return x86.AMOVUPS
   122  		}
   123  		if width <= 32 {
   124  			return x86.AVMOVDQU
   125  		}
   126  		return x86.AVMOVDQU64
   127  	}
   128  	// k -> gp, gp -> k, k -> k
   129  	if isKReg(dest) || isKReg(src) {
   130  		if isFPReg(dest) || isFPReg(src) {
   131  			panic(fmt.Sprintf("bad move, src=%v, dest=%v, width=%d", src, dest, width))
   132  		}
   133  		return x86.AKMOVQ
   134  	}
   135  	// gp -> fp, fp -> gp, gp -> gp
   136  	switch width {
   137  	case 1:
   138  		// Avoids partial register write
   139  		return x86.AMOVL
   140  	case 2:
   141  		return x86.AMOVL
   142  	case 4:
   143  		return x86.AMOVL
   144  	case 8:
   145  		return x86.AMOVQ
   146  	case 16:
   147  		if isLowFPReg(dest) && isLowFPReg(src) {
   148  			// int128s are in SSE registers
   149  			return x86.AMOVUPS
   150  		} else {
   151  			return x86.AVMOVDQU
   152  		}
   153  	case 32:
   154  		return x86.AVMOVDQU
   155  	case 64:
   156  		return x86.AVMOVDQU64
   157  	}
   158  	panic(fmt.Sprintf("bad move, src=%v, dest=%v, width=%d", src, dest, width))
   159  }
   160  
   161  // opregreg emits instructions for
   162  //
   163  //	dest := dest(To) op src(From)
   164  //
   165  // and also returns the created obj.Prog so it
   166  // may be further adjusted (offset, scale, etc).
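        // For example, opregreg(s, x86.AADDQ, dst, src) emits "ADDQ src, dst".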
   167  func opregreg(s *ssagen.State, op obj.As, dest, src int16) *obj.Prog {
   168  	p := s.Prog(op)
   169  	p.From.Type = obj.TYPE_REG
   170  	p.To.Type = obj.TYPE_REG
   171  	p.To.Reg = dest
   172  	p.From.Reg = src
   173  	return p
   174  }
   175  
   176  // memIdx fills out a as an indexed memory reference for v.
   177  // It assumes that the base register and the index register
   178  // are v.Args[0].Reg() and v.Args[1].Reg(), respectively.
   179  // The caller must still use gc.AddAux/gc.AddAux2 to handle v.Aux as necessary.
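        // Note: SP cannot be encoded as an index register in x86 addressing
        // modes, so when the scale is 1 the base and index are swapped if the
        // index would otherwise be SP.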
   180  func memIdx(a *obj.Addr, v *ssa.Value) {
   181  	r, i := v.Args[0].Reg(), v.Args[1].Reg()
   182  	a.Type = obj.TYPE_MEM
   183  	a.Scale = v.Op.Scale()
   184  	if a.Scale == 1 && i == x86.REG_SP {
   185  		r, i = i, r
   186  	}
   187  	a.Reg = r
   188  	a.Index = i
   189  }
   190  
   191  func getgFromTLS(s *ssagen.State, r int16) {
   192  	// See the comments in cmd/internal/obj/x86/obj6.go
   193  	// near CanUse1InsnTLS for a detailed explanation of these instructions.
   194  	if x86.CanUse1InsnTLS(base.Ctxt) {
   195  		// MOVQ (TLS), r
   196  		p := s.Prog(x86.AMOVQ)
   197  		p.From.Type = obj.TYPE_MEM
   198  		p.From.Reg = x86.REG_TLS
   199  		p.To.Type = obj.TYPE_REG
   200  		p.To.Reg = r
   201  	} else {
   202  		// MOVQ TLS, r
   203  		// MOVQ (r)(TLS*1), r
   204  		p := s.Prog(x86.AMOVQ)
   205  		p.From.Type = obj.TYPE_REG
   206  		p.From.Reg = x86.REG_TLS
   207  		p.To.Type = obj.TYPE_REG
   208  		p.To.Reg = r
   209  		q := s.Prog(x86.AMOVQ)
   210  		q.From.Type = obj.TYPE_MEM
   211  		q.From.Reg = r
   212  		q.From.Index = x86.REG_TLS
   213  		q.From.Scale = 1
   214  		q.To.Type = obj.TYPE_REG
   215  		q.To.Reg = r
   216  	}
   217  }
   218  
   219  func ssaGenValue(s *ssagen.State, v *ssa.Value) {
   220  	switch v.Op {
   221  	case ssa.OpAMD64VFMADD231SD, ssa.OpAMD64VFMADD231SS:
   222  		p := s.Prog(v.Op.Asm())
   223  		p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[2].Reg()}
   224  		p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
   225  		p.AddRestSourceReg(v.Args[1].Reg())
   226  	case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
   227  		r := v.Reg()
   228  		r1 := v.Args[0].Reg()
   229  		r2 := v.Args[1].Reg()
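        		// When the output register matches neither input, a three-operand
        		// add is synthesized with LEA, e.g. (a sketch) r=R8, r1=AX, r2=BX emits
        		//	LEAQ (AX)(BX*1), R8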
   230  		switch {
   231  		case r == r1:
   232  			p := s.Prog(v.Op.Asm())
   233  			p.From.Type = obj.TYPE_REG
   234  			p.From.Reg = r2
   235  			p.To.Type = obj.TYPE_REG
   236  			p.To.Reg = r
   237  		case r == r2:
   238  			p := s.Prog(v.Op.Asm())
   239  			p.From.Type = obj.TYPE_REG
   240  			p.From.Reg = r1
   241  			p.To.Type = obj.TYPE_REG
   242  			p.To.Reg = r
   243  		default:
   244  			var asm obj.As
   245  			if v.Op == ssa.OpAMD64ADDQ {
   246  				asm = x86.ALEAQ
   247  			} else {
   248  				asm = x86.ALEAL
   249  			}
   250  			p := s.Prog(asm)
   251  			p.From.Type = obj.TYPE_MEM
   252  			p.From.Reg = r1
   253  			p.From.Scale = 1
   254  			p.From.Index = r2
   255  			p.To.Type = obj.TYPE_REG
   256  			p.To.Reg = r
   257  		}
   258  	// 2-address opcode arithmetic
   259  	case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL,
   260  		ssa.OpAMD64MULQ, ssa.OpAMD64MULL,
   261  		ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL,
   262  		ssa.OpAMD64ORQ, ssa.OpAMD64ORL,
   263  		ssa.OpAMD64XORQ, ssa.OpAMD64XORL,
   264  		ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL,
   265  		ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
   266  		ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB,
   267  		ssa.OpAMD64ROLQ, ssa.OpAMD64ROLL, ssa.OpAMD64ROLW, ssa.OpAMD64ROLB,
   268  		ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB,
   269  		ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
   270  		ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
   271  		ssa.OpAMD64MINSS, ssa.OpAMD64MINSD,
   272  		ssa.OpAMD64POR, ssa.OpAMD64PXOR,
   273  		ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ,
   274  		ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ,
   275  		ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ,
   276  		ssa.OpAMD64PCMPEQB, ssa.OpAMD64PSIGNB,
   277  		ssa.OpAMD64PUNPCKLBW:
   278  		opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   279  
   280  	case ssa.OpAMD64PSHUFLW:
   281  		p := s.Prog(v.Op.Asm())
   282  		imm := v.AuxInt
   283  		if imm < 0 || imm > 255 {
   284  			v.Fatalf("Invalid source selection immediate")
   285  		}
   286  		p.From.Offset = imm
   287  		p.From.Type = obj.TYPE_CONST
   288  		p.AddRestSourceReg(v.Args[0].Reg())
   289  		p.To.Type = obj.TYPE_REG
   290  		p.To.Reg = v.Reg()
   291  
   292  	case ssa.OpAMD64PSHUFBbroadcast:
   293  		// PSHUFB with a control mask of zero copies byte 0 to all
   294  		// bytes in the register.
   295  		//
   296  		// X15 is always zero with ABIInternal.
   297  		if s.ABI != obj.ABIInternal {
   298  			// zero X15 manually
   299  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
   300  		}
   301  
   302  		p := s.Prog(v.Op.Asm())
   303  		p.From.Type = obj.TYPE_REG
   304  		p.To.Type = obj.TYPE_REG
   305  		p.To.Reg = v.Reg()
   306  		p.From.Reg = x86.REG_X15
   307  
   308  	case ssa.OpAMD64SHRDQ, ssa.OpAMD64SHLDQ:
   309  		p := s.Prog(v.Op.Asm())
   310  		lo, hi, bits := v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg()
   311  		p.From.Type = obj.TYPE_REG
   312  		p.From.Reg = bits
   313  		p.To.Type = obj.TYPE_REG
   314  		p.To.Reg = lo
   315  		p.AddRestSourceReg(hi)
   316  
   317  	case ssa.OpAMD64BLSIQ, ssa.OpAMD64BLSIL,
   318  		ssa.OpAMD64BLSMSKQ, ssa.OpAMD64BLSMSKL,
   319  		ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
   320  		p := s.Prog(v.Op.Asm())
   321  		p.From.Type = obj.TYPE_REG
   322  		p.From.Reg = v.Args[0].Reg()
   323  		p.To.Type = obj.TYPE_REG
   324  		switch v.Op {
   325  		case ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
   326  			p.To.Reg = v.Reg0()
   327  		default:
   328  			p.To.Reg = v.Reg()
   329  		}
   330  
   331  	case ssa.OpAMD64ANDNQ, ssa.OpAMD64ANDNL:
   332  		p := s.Prog(v.Op.Asm())
   333  		p.From.Type = obj.TYPE_REG
   334  		p.From.Reg = v.Args[0].Reg()
   335  		p.To.Type = obj.TYPE_REG
   336  		p.To.Reg = v.Reg()
   337  		p.AddRestSourceReg(v.Args[1].Reg())
   338  
   339  	case ssa.OpAMD64SARXL, ssa.OpAMD64SARXQ,
   340  		ssa.OpAMD64SHLXL, ssa.OpAMD64SHLXQ,
   341  		ssa.OpAMD64SHRXL, ssa.OpAMD64SHRXQ:
   342  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   343  		p.AddRestSourceReg(v.Args[0].Reg())
   344  
   345  	case ssa.OpAMD64SHLXLload, ssa.OpAMD64SHLXQload,
   346  		ssa.OpAMD64SHRXLload, ssa.OpAMD64SHRXQload,
   347  		ssa.OpAMD64SARXLload, ssa.OpAMD64SARXQload:
   348  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   349  		m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
   350  		ssagen.AddAux(&m, v)
   351  		p.AddRestSource(m)
   352  
   353  	case ssa.OpAMD64SHLXLloadidx1, ssa.OpAMD64SHLXLloadidx4, ssa.OpAMD64SHLXLloadidx8,
   354  		ssa.OpAMD64SHRXLloadidx1, ssa.OpAMD64SHRXLloadidx4, ssa.OpAMD64SHRXLloadidx8,
   355  		ssa.OpAMD64SARXLloadidx1, ssa.OpAMD64SARXLloadidx4, ssa.OpAMD64SARXLloadidx8,
   356  		ssa.OpAMD64SHLXQloadidx1, ssa.OpAMD64SHLXQloadidx8,
   357  		ssa.OpAMD64SHRXQloadidx1, ssa.OpAMD64SHRXQloadidx8,
   358  		ssa.OpAMD64SARXQloadidx1, ssa.OpAMD64SARXQloadidx8:
   359  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[2].Reg())
   360  		m := obj.Addr{Type: obj.TYPE_MEM}
   361  		memIdx(&m, v)
   362  		ssagen.AddAux(&m, v)
   363  		p.AddRestSource(m)
   364  
   365  	case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
   366  		// Arg[0] (the dividend) is in AX.
   367  		// Arg[1] (the divisor) can be in any other register.
   368  		// Result[0] (the quotient) is in AX.
   369  		// Result[1] (the remainder) is in DX.
   370  		r := v.Args[1].Reg()
   371  
   372  		// Zero extend dividend.
   373  		opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX)
   374  
   375  		// Issue divide.
   376  		p := s.Prog(v.Op.Asm())
   377  		p.From.Type = obj.TYPE_REG
   378  		p.From.Reg = r
   379  
   380  	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
   381  		// Arg[0] (the dividend) is in AX.
   382  		// Arg[1] (the divisor) can be in any other register.
   383  		// Result[0] (the quotient) is in AX.
   384  		// Result[1] (the remainder) is in DX.
   385  		r := v.Args[1].Reg()
   386  
   387  		var opCMP, opNEG, opSXD obj.As
   388  		switch v.Op {
   389  		case ssa.OpAMD64DIVQ:
   390  			opCMP, opNEG, opSXD = x86.ACMPQ, x86.ANEGQ, x86.ACQO
   391  		case ssa.OpAMD64DIVL:
   392  			opCMP, opNEG, opSXD = x86.ACMPL, x86.ANEGL, x86.ACDQ
   393  		case ssa.OpAMD64DIVW:
   394  			opCMP, opNEG, opSXD = x86.ACMPW, x86.ANEGW, x86.ACWD
   395  		}
   396  
   397  		// CPU faults upon signed overflow, which occurs when the most
   398  		// negative int is divided by -1. Handle divide by -1 as a special case.
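        		// For example (a sketch), a 64-bit divide with the divisor in CX
        		// emits roughly:
        		//	CMPQ CX, $-1
        		//	JNE  div
        		//	NEGQ AX        // quotient: n / -1 == -n
        		//	XORL DX, DX    // remainder: n % -1 == 0
        		//	JMP  done
        		// div:
        		//	CQO
        		//	IDIVQ CX
        		// done: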
   399  		var j1, j2 *obj.Prog
   400  		if ssa.DivisionNeedsFixUp(v) {
   401  			c := s.Prog(opCMP)
   402  			c.From.Type = obj.TYPE_REG
   403  			c.From.Reg = r
   404  			c.To.Type = obj.TYPE_CONST
   405  			c.To.Offset = -1
   406  
   407  			// Divisor is not -1, proceed with normal division.
   408  			j1 = s.Prog(x86.AJNE)
   409  			j1.To.Type = obj.TYPE_BRANCH
   410  
   411  			// Divisor is -1, manually compute quotient and remainder via fixup code.
   412  			// n / -1 = -n
   413  			n1 := s.Prog(opNEG)
   414  			n1.To.Type = obj.TYPE_REG
   415  			n1.To.Reg = x86.REG_AX
   416  
   417  			// n % -1 == 0
   418  			opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX)
   419  
   420  			// TODO(khr): issue only the -1 fixup code we need.
   421  			// For instance, if only the quotient is used, no point in zeroing the remainder.
   422  
   423  			// Skip over normal division.
   424  			j2 = s.Prog(obj.AJMP)
   425  			j2.To.Type = obj.TYPE_BRANCH
   426  		}
   427  
   428  		// Sign extend dividend and perform division.
   429  		p := s.Prog(opSXD)
   430  		if j1 != nil {
   431  			j1.To.SetTarget(p)
   432  		}
   433  		p = s.Prog(v.Op.Asm())
   434  		p.From.Type = obj.TYPE_REG
   435  		p.From.Reg = r
   436  
   437  		if j2 != nil {
   438  			j2.To.SetTarget(s.Pc())
   439  		}
   440  
   441  	case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU:
   442  		// The frontend rewrites constant division by 8/16/32-bit integers into
   443  		// HMUL by a constant.
   444  		// SSA rewrites generate the 64-bit versions.
   445  
   446  		// Arg[0] is already in AX as it's the only register we allow
   447  		// and DX is the only output we care about (the high bits)
   448  		p := s.Prog(v.Op.Asm())
   449  		p.From.Type = obj.TYPE_REG
   450  		p.From.Reg = v.Args[1].Reg()
   451  
   452  		// IMULB puts the high portion in AH instead of DL,
   453  		// so move it to DL for consistency
   454  		if v.Type.Size() == 1 {
   455  			m := s.Prog(x86.AMOVB)
   456  			m.From.Type = obj.TYPE_REG
   457  			m.From.Reg = x86.REG_AH
   458  			m.To.Type = obj.TYPE_REG
   459  			m.To.Reg = x86.REG_DX
   460  		}
   461  
   462  	case ssa.OpAMD64MULQU, ssa.OpAMD64MULLU:
   463  		// Arg[0] is already in AX as it's the only register we allow
   464  		// results lo in AX
   465  		p := s.Prog(v.Op.Asm())
   466  		p.From.Type = obj.TYPE_REG
   467  		p.From.Reg = v.Args[1].Reg()
   468  
   469  	case ssa.OpAMD64MULQU2:
   470  		// Arg[0] is already in AX as it's the only register we allow
   471  		// results hi in DX, lo in AX
   472  		p := s.Prog(v.Op.Asm())
   473  		p.From.Type = obj.TYPE_REG
   474  		p.From.Reg = v.Args[1].Reg()
   475  
   476  	case ssa.OpAMD64DIVQU2:
   477  		// Arg[0], Arg[1] are already in DX, AX, as they're the only registers we allow
   478  		// results q in AX, r in DX
   479  		p := s.Prog(v.Op.Asm())
   480  		p.From.Type = obj.TYPE_REG
   481  		p.From.Reg = v.Args[2].Reg()
   482  
   483  	case ssa.OpAMD64AVGQU:
   484  		// Compute (x+y)/2 unsigned.
   485  		// Do a 64-bit add; the overflow goes into the carry flag.
   486  		// Shift right once and pull the carry back into the 63rd bit.
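        		// In effect, the 65-bit sum is halved (a sketch, with x in the
        		// output register and y in the second argument register):
        		//	ADDQ y, x   // CF = bit 64 of x+y
        		//	RCRQ $1, x  // rotate CF back in as bit 63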
   487  		p := s.Prog(x86.AADDQ)
   488  		p.From.Type = obj.TYPE_REG
   489  		p.To.Type = obj.TYPE_REG
   490  		p.To.Reg = v.Reg()
   491  		p.From.Reg = v.Args[1].Reg()
   492  		p = s.Prog(x86.ARCRQ)
   493  		p.From.Type = obj.TYPE_CONST
   494  		p.From.Offset = 1
   495  		p.To.Type = obj.TYPE_REG
   496  		p.To.Reg = v.Reg()
   497  
   498  	case ssa.OpAMD64ADDQcarry, ssa.OpAMD64ADCQ:
   499  		r := v.Reg0()
   500  		r0 := v.Args[0].Reg()
   501  		r1 := v.Args[1].Reg()
   502  		switch r {
   503  		case r0:
   504  			p := s.Prog(v.Op.Asm())
   505  			p.From.Type = obj.TYPE_REG
   506  			p.From.Reg = r1
   507  			p.To.Type = obj.TYPE_REG
   508  			p.To.Reg = r
   509  		case r1:
   510  			p := s.Prog(v.Op.Asm())
   511  			p.From.Type = obj.TYPE_REG
   512  			p.From.Reg = r0
   513  			p.To.Type = obj.TYPE_REG
   514  			p.To.Reg = r
   515  		default:
   516  			v.Fatalf("output not in same register as an input %s", v.LongString())
   517  		}
   518  
   519  	case ssa.OpAMD64SUBQborrow, ssa.OpAMD64SBBQ:
   520  		p := s.Prog(v.Op.Asm())
   521  		p.From.Type = obj.TYPE_REG
   522  		p.From.Reg = v.Args[1].Reg()
   523  		p.To.Type = obj.TYPE_REG
   524  		p.To.Reg = v.Reg0()
   525  
   526  	case ssa.OpAMD64ADDQconstcarry, ssa.OpAMD64ADCQconst, ssa.OpAMD64SUBQconstborrow, ssa.OpAMD64SBBQconst:
   527  		p := s.Prog(v.Op.Asm())
   528  		p.From.Type = obj.TYPE_CONST
   529  		p.From.Offset = v.AuxInt
   530  		p.To.Type = obj.TYPE_REG
   531  		p.To.Reg = v.Reg0()
   532  
   533  	case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
   534  		r := v.Reg()
   535  		a := v.Args[0].Reg()
   536  		if r == a {
   537  			switch v.AuxInt {
   538  			case 1:
   539  				var asm obj.As
   540  				// The software optimization manual recommends add $1,reg.
   541  				// But inc/dec is 1 byte smaller. ICC always uses inc;
   542  				// Clang/GCC choose depending on flags, but prefer add.
   543  				// Experiments show that inc/dec is both a little faster
   544  				// and makes the binary a little smaller.
   545  				if v.Op == ssa.OpAMD64ADDQconst {
   546  					asm = x86.AINCQ
   547  				} else {
   548  					asm = x86.AINCL
   549  				}
   550  				p := s.Prog(asm)
   551  				p.To.Type = obj.TYPE_REG
   552  				p.To.Reg = r
   553  				return
   554  			case -1:
   555  				var asm obj.As
   556  				if v.Op == ssa.OpAMD64ADDQconst {
   557  					asm = x86.ADECQ
   558  				} else {
   559  					asm = x86.ADECL
   560  				}
   561  				p := s.Prog(asm)
   562  				p.To.Type = obj.TYPE_REG
   563  				p.To.Reg = r
   564  				return
   565  			case 0x80:
   566  				// 'SUBQ $-0x80, r' is shorter to encode than
   567  				// and functionally equivalent to 'ADDQ $0x80, r'.
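        				// (-0x80 fits in a sign-extended 8-bit immediate,
        				// while +0x80 does not.)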
   568  				asm := x86.ASUBL
   569  				if v.Op == ssa.OpAMD64ADDQconst {
   570  					asm = x86.ASUBQ
   571  				}
   572  				p := s.Prog(asm)
   573  				p.From.Type = obj.TYPE_CONST
   574  				p.From.Offset = -0x80
   575  				p.To.Type = obj.TYPE_REG
   576  				p.To.Reg = r
   577  				return
   578  
   579  			}
   580  			p := s.Prog(v.Op.Asm())
   581  			p.From.Type = obj.TYPE_CONST
   582  			p.From.Offset = v.AuxInt
   583  			p.To.Type = obj.TYPE_REG
   584  			p.To.Reg = r
   585  			return
   586  		}
   587  		var asm obj.As
   588  		if v.Op == ssa.OpAMD64ADDQconst {
   589  			asm = x86.ALEAQ
   590  		} else {
   591  			asm = x86.ALEAL
   592  		}
   593  		p := s.Prog(asm)
   594  		p.From.Type = obj.TYPE_MEM
   595  		p.From.Reg = a
   596  		p.From.Offset = v.AuxInt
   597  		p.To.Type = obj.TYPE_REG
   598  		p.To.Reg = r
   599  
   600  	case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ, ssa.OpAMD64CMOVWEQ,
   601  		ssa.OpAMD64CMOVQLT, ssa.OpAMD64CMOVLLT, ssa.OpAMD64CMOVWLT,
   602  		ssa.OpAMD64CMOVQNE, ssa.OpAMD64CMOVLNE, ssa.OpAMD64CMOVWNE,
   603  		ssa.OpAMD64CMOVQGT, ssa.OpAMD64CMOVLGT, ssa.OpAMD64CMOVWGT,
   604  		ssa.OpAMD64CMOVQLE, ssa.OpAMD64CMOVLLE, ssa.OpAMD64CMOVWLE,
   605  		ssa.OpAMD64CMOVQGE, ssa.OpAMD64CMOVLGE, ssa.OpAMD64CMOVWGE,
   606  		ssa.OpAMD64CMOVQHI, ssa.OpAMD64CMOVLHI, ssa.OpAMD64CMOVWHI,
   607  		ssa.OpAMD64CMOVQLS, ssa.OpAMD64CMOVLLS, ssa.OpAMD64CMOVWLS,
   608  		ssa.OpAMD64CMOVQCC, ssa.OpAMD64CMOVLCC, ssa.OpAMD64CMOVWCC,
   609  		ssa.OpAMD64CMOVQCS, ssa.OpAMD64CMOVLCS, ssa.OpAMD64CMOVWCS,
   610  		ssa.OpAMD64CMOVQGTF, ssa.OpAMD64CMOVLGTF, ssa.OpAMD64CMOVWGTF,
   611  		ssa.OpAMD64CMOVQGEF, ssa.OpAMD64CMOVLGEF, ssa.OpAMD64CMOVWGEF:
   612  		p := s.Prog(v.Op.Asm())
   613  		p.From.Type = obj.TYPE_REG
   614  		p.From.Reg = v.Args[1].Reg()
   615  		p.To.Type = obj.TYPE_REG
   616  		p.To.Reg = v.Reg()
   617  
   618  	case ssa.OpAMD64CMOVQNEF, ssa.OpAMD64CMOVLNEF, ssa.OpAMD64CMOVWNEF:
   619  		// Flag condition: ^ZERO || PARITY
   620  		// Generate:
   621  		//   CMOV*NE  SRC,DST
   622  		//   CMOV*PS  SRC,DST
   623  		p := s.Prog(v.Op.Asm())
   624  		p.From.Type = obj.TYPE_REG
   625  		p.From.Reg = v.Args[1].Reg()
   626  		p.To.Type = obj.TYPE_REG
   627  		p.To.Reg = v.Reg()
   628  		var q *obj.Prog
   629  		if v.Op == ssa.OpAMD64CMOVQNEF {
   630  			q = s.Prog(x86.ACMOVQPS)
   631  		} else if v.Op == ssa.OpAMD64CMOVLNEF {
   632  			q = s.Prog(x86.ACMOVLPS)
   633  		} else {
   634  			q = s.Prog(x86.ACMOVWPS)
   635  		}
   636  		q.From.Type = obj.TYPE_REG
   637  		q.From.Reg = v.Args[1].Reg()
   638  		q.To.Type = obj.TYPE_REG
   639  		q.To.Reg = v.Reg()
   640  
   641  	case ssa.OpAMD64CMOVQEQF, ssa.OpAMD64CMOVLEQF, ssa.OpAMD64CMOVWEQF:
   642  		// Flag condition: ZERO && !PARITY
   643  		// Generate:
   644  		//   MOV      SRC,TMP
   645  		//   CMOV*NE  DST,TMP
   646  		//   CMOV*PC  TMP,DST
   647  		//
   648  		// TODO(rasky): we could generate:
   649  		//   CMOV*NE  DST,SRC
   650  		//   CMOV*PC  SRC,DST
   651  		// But this requires a way for regalloc to know that SRC might be
   652  		// clobbered by this instruction.
   653  		t := v.RegTmp()
   654  		opregreg(s, moveByRegsWidth(t, v.Args[1].Reg(), v.Type.Size()), t, v.Args[1].Reg())
   655  
   656  		p := s.Prog(v.Op.Asm())
   657  		p.From.Type = obj.TYPE_REG
   658  		p.From.Reg = v.Reg()
   659  		p.To.Type = obj.TYPE_REG
   660  		p.To.Reg = t
   661  		var q *obj.Prog
   662  		if v.Op == ssa.OpAMD64CMOVQEQF {
   663  			q = s.Prog(x86.ACMOVQPC)
   664  		} else if v.Op == ssa.OpAMD64CMOVLEQF {
   665  			q = s.Prog(x86.ACMOVLPC)
   666  		} else {
   667  			q = s.Prog(x86.ACMOVWPC)
   668  		}
   669  		q.From.Type = obj.TYPE_REG
   670  		q.From.Reg = t
   671  		q.To.Type = obj.TYPE_REG
   672  		q.To.Reg = v.Reg()
   673  
   674  	case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
   675  		r := v.Reg()
   676  		p := s.Prog(v.Op.Asm())
   677  		p.From.Type = obj.TYPE_CONST
   678  		p.From.Offset = v.AuxInt
   679  		p.To.Type = obj.TYPE_REG
   680  		p.To.Reg = r
   681  		p.AddRestSourceReg(v.Args[0].Reg())
   682  
   683  	case ssa.OpAMD64ANDQconst:
   684  		asm := v.Op.Asm()
   685  		// If the constant is positive and fits into 32 bits, use ANDL.
   686  		// This saves a few bytes of encoding.
   687  		if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
   688  			asm = x86.AANDL
   689  		}
   690  		p := s.Prog(asm)
   691  		p.From.Type = obj.TYPE_CONST
   692  		p.From.Offset = v.AuxInt
   693  		p.To.Type = obj.TYPE_REG
   694  		p.To.Reg = v.Reg()
   695  
   696  	case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst,
   697  		ssa.OpAMD64ANDLconst,
   698  		ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst,
   699  		ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst,
   700  		ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst,
   701  		ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
   702  		ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
   703  		ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
   704  		p := s.Prog(v.Op.Asm())
   705  		p.From.Type = obj.TYPE_CONST
   706  		p.From.Offset = v.AuxInt
   707  		p.To.Type = obj.TYPE_REG
   708  		p.To.Reg = v.Reg()
   709  	case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask:
   710  		r := v.Reg()
   711  		p := s.Prog(v.Op.Asm())
   712  		p.From.Type = obj.TYPE_REG
   713  		p.From.Reg = r
   714  		p.To.Type = obj.TYPE_REG
   715  		p.To.Reg = r
   716  	case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8,
   717  		ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8,
   718  		ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
   719  		p := s.Prog(v.Op.Asm())
   720  		memIdx(&p.From, v)
   721  		o := v.Reg()
   722  		p.To.Type = obj.TYPE_REG
   723  		p.To.Reg = o
   724  		if v.AuxInt != 0 && v.Aux == nil {
   725  			// Emit an additional LEA to add the displacement instead of creating a slow 3-operand LEA.
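        			// For example (a sketch), LEAQ4 of R1/R2 with AuxInt=8 becomes:
        			//	LEAQ (R1)(R2*4), R3
        			//	LEAQ 8(R3), R3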
   726  			switch v.Op {
   727  			case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8:
   728  				p = s.Prog(x86.ALEAQ)
   729  			case ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8:
   730  				p = s.Prog(x86.ALEAL)
   731  			case ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
   732  				p = s.Prog(x86.ALEAW)
   733  			}
   734  			p.From.Type = obj.TYPE_MEM
   735  			p.From.Reg = o
   736  			p.To.Type = obj.TYPE_REG
   737  			p.To.Reg = o
   738  		}
   739  		ssagen.AddAux(&p.From, v)
   740  	case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL, ssa.OpAMD64LEAW:
   741  		p := s.Prog(v.Op.Asm())
   742  		p.From.Type = obj.TYPE_MEM
   743  		p.From.Reg = v.Args[0].Reg()
   744  		ssagen.AddAux(&p.From, v)
   745  		p.To.Type = obj.TYPE_REG
   746  		p.To.Reg = v.Reg()
   747  	case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
   748  		ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB,
   749  		ssa.OpAMD64BTL, ssa.OpAMD64BTQ:
   750  		opregreg(s, v.Op.Asm(), v.Args[1].Reg(), v.Args[0].Reg())
   751  	case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
   752  		// The Go assembler has swapped operands for UCOMISx relative to CMP,
   753  		// so we must account for that right here.
   754  		opregreg(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg())
   755  	case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
   756  		p := s.Prog(v.Op.Asm())
   757  		p.From.Type = obj.TYPE_REG
   758  		p.From.Reg = v.Args[0].Reg()
   759  		p.To.Type = obj.TYPE_CONST
   760  		p.To.Offset = v.AuxInt
   761  	case ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst,
   762  		ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst,
   763  		ssa.OpAMD64BTSQconst,
   764  		ssa.OpAMD64BTCQconst,
   765  		ssa.OpAMD64BTRQconst:
   766  		op := v.Op
   767  		if op == ssa.OpAMD64BTQconst && v.AuxInt < 32 {
   768  			// Emit 32-bit version because it's shorter
   769  			op = ssa.OpAMD64BTLconst
   770  		}
   771  		p := s.Prog(op.Asm())
   772  		p.From.Type = obj.TYPE_CONST
   773  		p.From.Offset = v.AuxInt
   774  		p.To.Type = obj.TYPE_REG
   775  		p.To.Reg = v.Args[0].Reg()
   776  	case ssa.OpAMD64CMPQload, ssa.OpAMD64CMPLload, ssa.OpAMD64CMPWload, ssa.OpAMD64CMPBload:
   777  		p := s.Prog(v.Op.Asm())
   778  		p.From.Type = obj.TYPE_MEM
   779  		p.From.Reg = v.Args[0].Reg()
   780  		ssagen.AddAux(&p.From, v)
   781  		p.To.Type = obj.TYPE_REG
   782  		p.To.Reg = v.Args[1].Reg()
   783  	case ssa.OpAMD64CMPQconstload, ssa.OpAMD64CMPLconstload, ssa.OpAMD64CMPWconstload, ssa.OpAMD64CMPBconstload:
   784  		sc := v.AuxValAndOff()
   785  		p := s.Prog(v.Op.Asm())
   786  		p.From.Type = obj.TYPE_MEM
   787  		p.From.Reg = v.Args[0].Reg()
   788  		ssagen.AddAux2(&p.From, v, sc.Off64())
   789  		p.To.Type = obj.TYPE_CONST
   790  		p.To.Offset = sc.Val64()
   791  	case ssa.OpAMD64CMPQloadidx8, ssa.OpAMD64CMPQloadidx1, ssa.OpAMD64CMPLloadidx4, ssa.OpAMD64CMPLloadidx1, ssa.OpAMD64CMPWloadidx2, ssa.OpAMD64CMPWloadidx1, ssa.OpAMD64CMPBloadidx1:
   792  		p := s.Prog(v.Op.Asm())
   793  		memIdx(&p.From, v)
   794  		ssagen.AddAux(&p.From, v)
   795  		p.To.Type = obj.TYPE_REG
   796  		p.To.Reg = v.Args[2].Reg()
   797  	case ssa.OpAMD64CMPQconstloadidx8, ssa.OpAMD64CMPQconstloadidx1, ssa.OpAMD64CMPLconstloadidx4, ssa.OpAMD64CMPLconstloadidx1, ssa.OpAMD64CMPWconstloadidx2, ssa.OpAMD64CMPWconstloadidx1, ssa.OpAMD64CMPBconstloadidx1:
   798  		sc := v.AuxValAndOff()
   799  		p := s.Prog(v.Op.Asm())
   800  		memIdx(&p.From, v)
   801  		ssagen.AddAux2(&p.From, v, sc.Off64())
   802  		p.To.Type = obj.TYPE_CONST
   803  		p.To.Offset = sc.Val64()
   804  	case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
   805  		x := v.Reg()
   806  
   807  		// If flags aren't live (indicated by v.Aux == nil),
   808  		// then we can rewrite MOV $0, AX into XOR AX, AX.
   809  		if v.AuxInt == 0 && v.Aux == nil {
   810  			opregreg(s, x86.AXORL, x, x)
   811  			break
   812  		}
   813  
   814  		asm := v.Op.Asm()
   815  		// Use MOVL to move a small constant into a register
   816  		// when the constant is positive and fits into 32 bits.
   817  		if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
   818  			// The upper 32 bits are zeroed automatically when using MOVL.
   819  			asm = x86.AMOVL
   820  		}
   821  		p := s.Prog(asm)
   822  		p.From.Type = obj.TYPE_CONST
   823  		p.From.Offset = v.AuxInt
   824  		p.To.Type = obj.TYPE_REG
   825  		p.To.Reg = x
   826  
   827  	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
   828  		x := v.Reg()
   829  		if !isFPReg(x) && v.AuxInt == 0 && v.Aux == nil {
   830  			opregreg(s, x86.AXORL, x, x)
   831  			break
   832  		}
   833  		p := s.Prog(storeByRegWidth(x, v.Type.Size()))
   834  		p.From.Type = obj.TYPE_FCONST
   835  		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
   836  		p.To.Type = obj.TYPE_REG
   837  		p.To.Reg = x
   838  	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
   839  		ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
   840  		ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
   841  		p := s.Prog(v.Op.Asm())
   842  		p.From.Type = obj.TYPE_MEM
   843  		p.From.Reg = v.Args[0].Reg()
   844  		ssagen.AddAux(&p.From, v)
   845  		p.To.Type = obj.TYPE_REG
   846  		p.To.Reg = v.Reg()
   847  	case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1,
   848  		ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8, ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4, ssa.OpAMD64MOVWloadidx2,
   849  		ssa.OpAMD64MOVBELloadidx1, ssa.OpAMD64MOVBELloadidx4, ssa.OpAMD64MOVBELloadidx8, ssa.OpAMD64MOVBEQloadidx1, ssa.OpAMD64MOVBEQloadidx8:
   850  		p := s.Prog(v.Op.Asm())
   851  		memIdx(&p.From, v)
   852  		ssagen.AddAux(&p.From, v)
   853  		p.To.Type = obj.TYPE_REG
   854  		p.To.Reg = v.Reg()
   855  	case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore,
   856  		ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify,
   857  		ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify,
   858  		ssa.OpAMD64MOVBEQstore, ssa.OpAMD64MOVBELstore, ssa.OpAMD64MOVBEWstore:
   859  		p := s.Prog(v.Op.Asm())
   860  		p.From.Type = obj.TYPE_REG
   861  		p.From.Reg = v.Args[1].Reg()
   862  		p.To.Type = obj.TYPE_MEM
   863  		p.To.Reg = v.Args[0].Reg()
   864  		ssagen.AddAux(&p.To, v)
   865  	case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1,
   866  		ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8, ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4, ssa.OpAMD64MOVWstoreidx2,
   867  		ssa.OpAMD64ADDLmodifyidx1, ssa.OpAMD64ADDLmodifyidx4, ssa.OpAMD64ADDLmodifyidx8, ssa.OpAMD64ADDQmodifyidx1, ssa.OpAMD64ADDQmodifyidx8,
   868  		ssa.OpAMD64SUBLmodifyidx1, ssa.OpAMD64SUBLmodifyidx4, ssa.OpAMD64SUBLmodifyidx8, ssa.OpAMD64SUBQmodifyidx1, ssa.OpAMD64SUBQmodifyidx8,
   869  		ssa.OpAMD64ANDLmodifyidx1, ssa.OpAMD64ANDLmodifyidx4, ssa.OpAMD64ANDLmodifyidx8, ssa.OpAMD64ANDQmodifyidx1, ssa.OpAMD64ANDQmodifyidx8,
   870  		ssa.OpAMD64ORLmodifyidx1, ssa.OpAMD64ORLmodifyidx4, ssa.OpAMD64ORLmodifyidx8, ssa.OpAMD64ORQmodifyidx1, ssa.OpAMD64ORQmodifyidx8,
   871  		ssa.OpAMD64XORLmodifyidx1, ssa.OpAMD64XORLmodifyidx4, ssa.OpAMD64XORLmodifyidx8, ssa.OpAMD64XORQmodifyidx1, ssa.OpAMD64XORQmodifyidx8,
   872  		ssa.OpAMD64MOVBEWstoreidx1, ssa.OpAMD64MOVBEWstoreidx2, ssa.OpAMD64MOVBELstoreidx1, ssa.OpAMD64MOVBELstoreidx4, ssa.OpAMD64MOVBELstoreidx8, ssa.OpAMD64MOVBEQstoreidx1, ssa.OpAMD64MOVBEQstoreidx8:
   873  		p := s.Prog(v.Op.Asm())
   874  		p.From.Type = obj.TYPE_REG
   875  		p.From.Reg = v.Args[2].Reg()
   876  		memIdx(&p.To, v)
   877  		ssagen.AddAux(&p.To, v)
   878  	case ssa.OpAMD64ADDQconstmodify, ssa.OpAMD64ADDLconstmodify:
   879  		sc := v.AuxValAndOff()
   880  		off := sc.Off64()
   881  		val := sc.Val()
   882  		if val == 1 || val == -1 {
   883  			var asm obj.As
   884  			if v.Op == ssa.OpAMD64ADDQconstmodify {
   885  				if val == 1 {
   886  					asm = x86.AINCQ
   887  				} else {
   888  					asm = x86.ADECQ
   889  				}
   890  			} else {
   891  				if val == 1 {
   892  					asm = x86.AINCL
   893  				} else {
   894  					asm = x86.ADECL
   895  				}
   896  			}
   897  			p := s.Prog(asm)
   898  			p.To.Type = obj.TYPE_MEM
   899  			p.To.Reg = v.Args[0].Reg()
   900  			ssagen.AddAux2(&p.To, v, off)
   901  			break
   902  		}
   903  		fallthrough
   904  	case ssa.OpAMD64ANDQconstmodify, ssa.OpAMD64ANDLconstmodify, ssa.OpAMD64ORQconstmodify, ssa.OpAMD64ORLconstmodify,
   905  		ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify,
   906  		ssa.OpAMD64BTSQconstmodify, ssa.OpAMD64BTRQconstmodify, ssa.OpAMD64BTCQconstmodify:
   907  		sc := v.AuxValAndOff()
   908  		off := sc.Off64()
   909  		val := sc.Val64()
   910  		p := s.Prog(v.Op.Asm())
   911  		p.From.Type = obj.TYPE_CONST
   912  		p.From.Offset = val
   913  		p.To.Type = obj.TYPE_MEM
   914  		p.To.Reg = v.Args[0].Reg()
   915  		ssagen.AddAux2(&p.To, v, off)
   916  
   917  	case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
   918  		p := s.Prog(v.Op.Asm())
   919  		p.From.Type = obj.TYPE_CONST
   920  		sc := v.AuxValAndOff()
   921  		p.From.Offset = sc.Val64()
   922  		p.To.Type = obj.TYPE_MEM
   923  		p.To.Reg = v.Args[0].Reg()
   924  		ssagen.AddAux2(&p.To, v, sc.Off64())
   925  	case ssa.OpAMD64MOVOstoreconst:
   926  		sc := v.AuxValAndOff()
   927  		if sc.Val() != 0 {
   928  			v.Fatalf("MOVO for non zero constants not implemented: %s", v.LongString())
   929  		}
   930  
   931  		if s.ABI != obj.ABIInternal {
   932  			// zero X15 manually
   933  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
   934  		}
   935  		p := s.Prog(v.Op.Asm())
   936  		p.From.Type = obj.TYPE_REG
   937  		p.From.Reg = x86.REG_X15
   938  		p.To.Type = obj.TYPE_MEM
   939  		p.To.Reg = v.Args[0].Reg()
   940  		ssagen.AddAux2(&p.To, v, sc.Off64())
   941  
   942  	case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1,
   943  		ssa.OpAMD64ADDLconstmodifyidx1, ssa.OpAMD64ADDLconstmodifyidx4, ssa.OpAMD64ADDLconstmodifyidx8, ssa.OpAMD64ADDQconstmodifyidx1, ssa.OpAMD64ADDQconstmodifyidx8,
   944  		ssa.OpAMD64ANDLconstmodifyidx1, ssa.OpAMD64ANDLconstmodifyidx4, ssa.OpAMD64ANDLconstmodifyidx8, ssa.OpAMD64ANDQconstmodifyidx1, ssa.OpAMD64ANDQconstmodifyidx8,
   945  		ssa.OpAMD64ORLconstmodifyidx1, ssa.OpAMD64ORLconstmodifyidx4, ssa.OpAMD64ORLconstmodifyidx8, ssa.OpAMD64ORQconstmodifyidx1, ssa.OpAMD64ORQconstmodifyidx8,
   946  		ssa.OpAMD64XORLconstmodifyidx1, ssa.OpAMD64XORLconstmodifyidx4, ssa.OpAMD64XORLconstmodifyidx8, ssa.OpAMD64XORQconstmodifyidx1, ssa.OpAMD64XORQconstmodifyidx8:
   947  		p := s.Prog(v.Op.Asm())
   948  		p.From.Type = obj.TYPE_CONST
   949  		sc := v.AuxValAndOff()
   950  		p.From.Offset = sc.Val64()
   951  		switch {
   952  		case p.As == x86.AADDQ && p.From.Offset == 1:
   953  			p.As = x86.AINCQ
   954  			p.From.Type = obj.TYPE_NONE
   955  		case p.As == x86.AADDQ && p.From.Offset == -1:
   956  			p.As = x86.ADECQ
   957  			p.From.Type = obj.TYPE_NONE
   958  		case p.As == x86.AADDL && p.From.Offset == 1:
   959  			p.As = x86.AINCL
   960  			p.From.Type = obj.TYPE_NONE
   961  		case p.As == x86.AADDL && p.From.Offset == -1:
   962  			p.As = x86.ADECL
   963  			p.From.Type = obj.TYPE_NONE
   964  		}
   965  		memIdx(&p.To, v)
   966  		ssagen.AddAux2(&p.To, v, sc.Off64())
   967  	case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
   968  		ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
   969  		ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS, ssa.OpAMD64VPBROADCASTB, ssa.OpAMD64PMOVMSKB:
   970  		opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg())
   971  	case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
   972  		r := v.Reg()
   973  		// Break false dependency on destination register.
   974  		opregreg(s, x86.AXORPS, r, r)
   975  		opregreg(s, v.Op.Asm(), r, v.Args[0].Reg())
   976  	case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i, ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
   977  		var p *obj.Prog
   978  		switch v.Op {
   979  		case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i:
   980  			p = s.Prog(x86.AMOVQ)
   981  		case ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
   982  			p = s.Prog(x86.AMOVL)
   983  		}
   984  		p.From.Type = obj.TYPE_REG
   985  		p.From.Reg = v.Args[0].Reg()
   986  		p.To.Type = obj.TYPE_REG
   987  		p.To.Reg = v.Reg()
   988  	case ssa.OpAMD64ADDQload, ssa.OpAMD64ADDLload, ssa.OpAMD64SUBQload, ssa.OpAMD64SUBLload,
   989  		ssa.OpAMD64ANDQload, ssa.OpAMD64ANDLload, ssa.OpAMD64ORQload, ssa.OpAMD64ORLload,
   990  		ssa.OpAMD64XORQload, ssa.OpAMD64XORLload, ssa.OpAMD64ADDSDload, ssa.OpAMD64ADDSSload,
   991  		ssa.OpAMD64SUBSDload, ssa.OpAMD64SUBSSload, ssa.OpAMD64MULSDload, ssa.OpAMD64MULSSload,
   992  		ssa.OpAMD64DIVSDload, ssa.OpAMD64DIVSSload:
   993  		p := s.Prog(v.Op.Asm())
   994  		p.From.Type = obj.TYPE_MEM
   995  		p.From.Reg = v.Args[1].Reg()
   996  		ssagen.AddAux(&p.From, v)
   997  		p.To.Type = obj.TYPE_REG
   998  		p.To.Reg = v.Reg()
   999  	case ssa.OpAMD64ADDLloadidx1, ssa.OpAMD64ADDLloadidx4, ssa.OpAMD64ADDLloadidx8, ssa.OpAMD64ADDQloadidx1, ssa.OpAMD64ADDQloadidx8,
  1000  		ssa.OpAMD64SUBLloadidx1, ssa.OpAMD64SUBLloadidx4, ssa.OpAMD64SUBLloadidx8, ssa.OpAMD64SUBQloadidx1, ssa.OpAMD64SUBQloadidx8,
  1001  		ssa.OpAMD64ANDLloadidx1, ssa.OpAMD64ANDLloadidx4, ssa.OpAMD64ANDLloadidx8, ssa.OpAMD64ANDQloadidx1, ssa.OpAMD64ANDQloadidx8,
  1002  		ssa.OpAMD64ORLloadidx1, ssa.OpAMD64ORLloadidx4, ssa.OpAMD64ORLloadidx8, ssa.OpAMD64ORQloadidx1, ssa.OpAMD64ORQloadidx8,
  1003  		ssa.OpAMD64XORLloadidx1, ssa.OpAMD64XORLloadidx4, ssa.OpAMD64XORLloadidx8, ssa.OpAMD64XORQloadidx1, ssa.OpAMD64XORQloadidx8,
  1004  		ssa.OpAMD64ADDSSloadidx1, ssa.OpAMD64ADDSSloadidx4, ssa.OpAMD64ADDSDloadidx1, ssa.OpAMD64ADDSDloadidx8,
  1005  		ssa.OpAMD64SUBSSloadidx1, ssa.OpAMD64SUBSSloadidx4, ssa.OpAMD64SUBSDloadidx1, ssa.OpAMD64SUBSDloadidx8,
  1006  		ssa.OpAMD64MULSSloadidx1, ssa.OpAMD64MULSSloadidx4, ssa.OpAMD64MULSDloadidx1, ssa.OpAMD64MULSDloadidx8,
  1007  		ssa.OpAMD64DIVSSloadidx1, ssa.OpAMD64DIVSSloadidx4, ssa.OpAMD64DIVSDloadidx1, ssa.OpAMD64DIVSDloadidx8:
  1008  		p := s.Prog(v.Op.Asm())
  1009  
  1010  		r, i := v.Args[1].Reg(), v.Args[2].Reg()
  1011  		p.From.Type = obj.TYPE_MEM
  1012  		p.From.Scale = v.Op.Scale()
  1013  		if p.From.Scale == 1 && i == x86.REG_SP {
  1014  			r, i = i, r
  1015  		}
  1016  		p.From.Reg = r
  1017  		p.From.Index = i
  1018  
  1019  		ssagen.AddAux(&p.From, v)
  1020  		p.To.Type = obj.TYPE_REG
  1021  		p.To.Reg = v.Reg()
  1022  
  1023  	case ssa.OpAMD64LoweredZero:
  1024  		if s.ABI != obj.ABIInternal {
  1025  			// zero X15 manually
  1026  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
  1027  		}
  1028  		ptrReg := v.Args[0].Reg()
  1029  		n := v.AuxInt
  1030  		if n < 16 {
  1031  			v.Fatalf("Zero too small %d", n)
  1032  		}
  1033  		zero16 := func(off int64) {
  1034  			zero16(s, ptrReg, off)
  1035  		}
  1036  
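        		// For example (a sketch), n=40 emits writes at offsets 0, 16, and 24;
        		// the final 16-byte store overlaps the previous one by 8 bytes.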
  1037  		// Generate zeroing instructions.
  1038  		var off int64
  1039  		for n >= 16 {
  1040  			zero16(off)
  1041  			off += 16
  1042  			n -= 16
  1043  		}
  1044  		if n != 0 {
  1045  			// Use a partially overlapping write.
  1046  			// TODO: for n <= 8, use a smaller write?
  1047  			zero16(off + n - 16)
  1048  		}
  1049  
  1050  	case ssa.OpAMD64LoweredZeroLoop:
  1051  		if s.ABI != obj.ABIInternal {
  1052  			// zero X15 manually
  1053  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
  1054  		}
  1055  		ptrReg := v.Args[0].Reg()
  1056  		countReg := v.RegTmp()
  1057  		n := v.AuxInt
  1058  		loopSize := int64(64)
  1059  		if n < 3*loopSize {
  1060  			// - a loop count of 0 won't work.
  1061  			// - a loop count of 1 is useless.
  1062  			// - a loop count of 2 is a code size ~tie
  1063  			//     4 instructions to implement the loop
  1064  			//     4 instructions in the loop body
  1065  			//   vs
  1066  			//     8 instructions in the straightline code
  1067  			//   Might as well use straightline code.
  1068  			v.Fatalf("ZeroLoop size too small %d", n)
  1069  		}
  1070  		zero16 := func(off int64) {
  1071  			zero16(s, ptrReg, off)
  1072  		}
  1073  
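        		// The emitted loop is roughly (a sketch; zero16 stores the zeroed
        		// X15 register with MOVUPS):
        		//	MOVL $(n/64), countReg
        		// loop:
        		//	MOVUPS X15, (ptrReg)
        		//	MOVUPS X15, 16(ptrReg)
        		//	MOVUPS X15, 32(ptrReg)
        		//	MOVUPS X15, 48(ptrReg)
        		//	ADDQ $64, ptrReg
        		//	DECL countReg
        		//	JNE  loop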
  1074  		// Put iteration count in a register.
  1075  		//   MOVL    $n, countReg
  1076  		p := s.Prog(x86.AMOVL)
  1077  		p.From.Type = obj.TYPE_CONST
  1078  		p.From.Offset = n / loopSize
  1079  		p.To.Type = obj.TYPE_REG
  1080  		p.To.Reg = countReg
  1081  		cntInit := p
  1082  
  1083  		// Zero loopSize bytes starting at ptrReg.
  1084  		for i := range loopSize / 16 {
  1085  			zero16(i * 16)
  1086  		}
  1087  		//   ADDQ    $loopSize, ptrReg
  1088  		p = s.Prog(x86.AADDQ)
  1089  		p.From.Type = obj.TYPE_CONST
  1090  		p.From.Offset = loopSize
  1091  		p.To.Type = obj.TYPE_REG
  1092  		p.To.Reg = ptrReg
  1093  		//   DECL    countReg
  1094  		p = s.Prog(x86.ADECL)
  1095  		p.To.Type = obj.TYPE_REG
  1096  		p.To.Reg = countReg
  1097  		// Jump to first instruction in loop if we're not done yet.
  1098  		//   JNE     head
  1099  		p = s.Prog(x86.AJNE)
  1100  		p.To.Type = obj.TYPE_BRANCH
  1101  		p.To.SetTarget(cntInit.Link)
  1102  
  1103  		// Multiples of the loop size are now done.
  1104  		n %= loopSize
  1105  
  1106  		// Write any fractional portion.
  1107  		var off int64
  1108  		for n >= 16 {
  1109  			zero16(off)
  1110  			off += 16
  1111  			n -= 16
  1112  		}
  1113  		if n != 0 {
  1114  			// Use a partially overlapping write.
  1115  			// TODO: for n <= 8, use a smaller write?
  1116  			zero16(off + n - 16)
  1117  		}
  1118  
  1119  	case ssa.OpAMD64LoweredMove:
  1120  		dstReg := v.Args[0].Reg()
  1121  		srcReg := v.Args[1].Reg()
  1122  		if dstReg == srcReg {
  1123  			break
  1124  		}
  1125  		tmpReg := int16(x86.REG_X14)
  1126  		n := v.AuxInt
  1127  		if n < 16 {
  1128  			v.Fatalf("Move too small %d", n)
  1129  		}
  1130  		// move 16 bytes from srcReg+off to dstReg+off.
  1131  		move16 := func(off int64) {
  1132  			move16(s, srcReg, dstReg, tmpReg, off)
  1133  		}
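        		// Each move16 call copies one 16-byte chunk through tmpReg, roughly
        		// (a sketch):
        		//	MOVUPS off(srcReg), X14
        		//	MOVUPS X14, off(dstReg)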
  1134  
  1135  		// Generate copying instructions.
  1136  		var off int64
  1137  		for n >= 16 {
  1138  			move16(off)
  1139  			off += 16
  1140  			n -= 16
  1141  		}
  1142  		if n != 0 {
  1143  			// Use a partially overlapping read/write.
  1144  			// TODO: use smaller operations when we can?
  1145  			move16(off + n - 16)
  1146  		}
  1147  
  1148  	case ssa.OpAMD64LoweredMoveLoop:
  1149  		dstReg := v.Args[0].Reg()
  1150  		srcReg := v.Args[1].Reg()
  1151  		if dstReg == srcReg {
  1152  			break
  1153  		}
  1154  		countReg := v.RegTmp()
  1155  		tmpReg := int16(x86.REG_X14)
  1156  		n := v.AuxInt
  1157  		loopSize := int64(64)
  1158  		if n < 3*loopSize {
  1159  			// - a loop count of 0 won't work.
  1160  			// - a loop count of 1 is useless.
  1161  			// - a loop count of 2 is a code size ~tie
  1162  			//     4 instructions to implement the loop
  1163  			//     4 instructions in the loop body
  1164  			//   vs
  1165  			//     8 instructions in the straightline code
  1166  			//   Might as well use straightline code.
  1167  			v.Fatalf("MoveLoop size too small %d", n)
  1168  		}
  1169  		// move 16 bytes from srcReg+off to dstReg+off.
  1170  		move16 := func(off int64) {
  1171  			move16(s, srcReg, dstReg, tmpReg, off)
  1172  		}
  1173  
  1174  		// Put iteration count in a register.
  1175  		//   MOVL    $n, countReg
  1176  		p := s.Prog(x86.AMOVL)
  1177  		p.From.Type = obj.TYPE_CONST
  1178  		p.From.Offset = n / loopSize
  1179  		p.To.Type = obj.TYPE_REG
  1180  		p.To.Reg = countReg
  1181  		cntInit := p
  1182  
  1183  		// Copy loopSize bytes starting at srcReg to dstReg.
  1184  		for i := range loopSize / 16 {
  1185  			move16(i * 16)
  1186  		}
  1187  		//   ADDQ    $loopSize, srcReg
  1188  		p = s.Prog(x86.AADDQ)
  1189  		p.From.Type = obj.TYPE_CONST
  1190  		p.From.Offset = loopSize
  1191  		p.To.Type = obj.TYPE_REG
  1192  		p.To.Reg = srcReg
  1193  		//   ADDQ    $loopSize, dstReg
  1194  		p = s.Prog(x86.AADDQ)
  1195  		p.From.Type = obj.TYPE_CONST
  1196  		p.From.Offset = loopSize
  1197  		p.To.Type = obj.TYPE_REG
  1198  		p.To.Reg = dstReg
  1199  		//   DECL    countReg
  1200  		p = s.Prog(x86.ADECL)
  1201  		p.To.Type = obj.TYPE_REG
  1202  		p.To.Reg = countReg
  1203  		// Jump to loop header if we're not done yet.
  1204  		//   JNE     head
  1205  		p = s.Prog(x86.AJNE)
  1206  		p.To.Type = obj.TYPE_BRANCH
  1207  		p.To.SetTarget(cntInit.Link)
  1208  
  1209  		// Multiples of the loop size are now done.
  1210  		n %= loopSize
  1211  
  1212  		// Copy any fractional portion.
  1213  		var off int64
  1214  		for n >= 16 {
  1215  			move16(off)
  1216  			off += 16
  1217  			n -= 16
  1218  		}
  1219  		if n != 0 {
  1220  			// Use a partially overlapping copy.
  1221  			move16(off + n - 16)
  1222  		}
  1223  
  1224  	case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
  1225  		if v.Type.IsMemory() {
  1226  			return
  1227  		}
  1228  		x := v.Args[0].Reg()
  1229  		y := v.Reg()
  1230  		if v.Type.IsSIMD() {
  1231  			x = simdOrMaskReg(v.Args[0])
  1232  			y = simdOrMaskReg(v)
  1233  		}
  1234  		if x != y {
  1235  			opregreg(s, moveByRegsWidth(y, x, v.Type.Size()), y, x)
  1236  		}
  1237  	case ssa.OpLoadReg:
  1238  		if v.Type.IsFlags() {
  1239  			v.Fatalf("load flags not implemented: %v", v.LongString())
  1240  			return
  1241  		}
  1242  		r := v.Reg()
  1243  		p := s.Prog(loadByRegWidth(r, v.Type.Size()))
  1244  		ssagen.AddrAuto(&p.From, v.Args[0])
  1245  		p.To.Type = obj.TYPE_REG
  1246  		if v.Type.IsSIMD() {
  1247  			r = simdOrMaskReg(v)
  1248  		}
  1249  		p.To.Reg = r
  1250  
  1251  	case ssa.OpStoreReg:
  1252  		if v.Type.IsFlags() {
  1253  			v.Fatalf("store flags not implemented: %v", v.LongString())
  1254  			return
  1255  		}
  1256  		r := v.Args[0].Reg()
  1257  		if v.Type.IsSIMD() {
  1258  			r = simdOrMaskReg(v.Args[0])
  1259  		}
  1260  		p := s.Prog(storeByRegWidth(r, v.Type.Size()))
  1261  		p.From.Type = obj.TYPE_REG
  1262  		p.From.Reg = r
  1263  		ssagen.AddrAuto(&p.To, v)
  1264  	case ssa.OpAMD64LoweredHasCPUFeature:
  1265  		p := s.Prog(x86.AMOVBLZX)
  1266  		p.From.Type = obj.TYPE_MEM
  1267  		ssagen.AddAux(&p.From, v)
  1268  		p.To.Type = obj.TYPE_REG
  1269  		p.To.Reg = v.Reg()
  1270  	case ssa.OpArgIntReg, ssa.OpArgFloatReg:
  1271  		// The assembler needs to wrap the entry safepoint/stack growth code with spills/unspills of the register args.
  1272  		// The loop below runs only once per function; RegArgs is cleared afterwards.
  1273  		for _, ap := range v.Block.Func.RegArgs {
  1274  			// Pass the spill/unspill information along to the assembler, offset by size of return PC pushed on stack.
  1275  			addr := ssagen.SpillSlotAddr(ap, x86.REG_SP, v.Block.Func.Config.PtrSize)
  1276  			reg := ap.Reg
  1277  			t := ap.Type
  1278  			sz := t.Size()
  1279  			if t.IsSIMD() {
  1280  				reg = simdRegBySize(reg, sz)
  1281  			}
  1282  			s.FuncInfo().AddSpill(
  1283  				obj.RegSpill{Reg: reg, Addr: addr, Unspill: loadByRegWidth(reg, sz), Spill: storeByRegWidth(reg, sz)})
  1284  		}
  1285  		v.Block.Func.RegArgs = nil
  1286  		ssagen.CheckArgReg(v)
  1287  	case ssa.OpAMD64LoweredGetClosurePtr:
  1288  		// Closure pointer is DX.
  1289  		ssagen.CheckLoweredGetClosurePtr(v)
  1290  	case ssa.OpAMD64LoweredGetG:
  1291  		if s.ABI == obj.ABIInternal {
  1292  			v.Fatalf("LoweredGetG should not appear in ABIInternal")
  1293  		}
  1294  		r := v.Reg()
  1295  		getgFromTLS(s, r)
  1296  	case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail:
  1297  		if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal {
  1298  			// zeroing X15 when entering ABIInternal from ABI0
  1299  			zeroX15(s)
  1300  			// set G register from TLS
  1301  			getgFromTLS(s, x86.REG_R14)
  1302  		}
  1303  		if v.Op == ssa.OpAMD64CALLtail {
  1304  			s.TailCall(v)
  1305  			break
  1306  		}
  1307  		s.Call(v)
  1308  		if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 {
  1309  			// zeroing X15 when entering ABIInternal from ABI0
  1310  			zeroX15(s)
  1311  			// set G register from TLS
  1312  			getgFromTLS(s, x86.REG_R14)
  1313  		}
  1314  	case ssa.OpAMD64CALLclosure, ssa.OpAMD64CALLinter:
  1315  		s.Call(v)
  1316  
  1317  	case ssa.OpAMD64LoweredGetCallerPC:
  1318  		p := s.Prog(x86.AMOVQ)
  1319  		p.From.Type = obj.TYPE_MEM
  1320  		p.From.Offset = -8 // PC is stored 8 bytes below first parameter.
  1321  		p.From.Name = obj.NAME_PARAM
  1322  		p.To.Type = obj.TYPE_REG
  1323  		p.To.Reg = v.Reg()
  1324  
  1325  	case ssa.OpAMD64LoweredGetCallerSP:
  1326  		// caller's SP is the address of the first arg
  1327  		mov := x86.AMOVQ
  1328  		if types.PtrSize == 4 {
  1329  			mov = x86.AMOVL
  1330  		}
  1331  		p := s.Prog(mov)
  1332  		p.From.Type = obj.TYPE_ADDR
  1333  		p.From.Offset = -base.Ctxt.Arch.FixedFrameSize // 0 on amd64, just to be consistent with other architectures
  1334  		p.From.Name = obj.NAME_PARAM
  1335  		p.To.Type = obj.TYPE_REG
  1336  		p.To.Reg = v.Reg()
  1337  
  1338  	case ssa.OpAMD64LoweredWB:
  1339  		p := s.Prog(obj.ACALL)
  1340  		p.To.Type = obj.TYPE_MEM
  1341  		p.To.Name = obj.NAME_EXTERN
  1342  		// AuxInt encodes how many buffer entries we need.
  1343  		p.To.Sym = ir.Syms.GCWriteBarrier[v.AuxInt-1]
  1344  
  1345  	case ssa.OpAMD64LoweredPanicBoundsRR, ssa.OpAMD64LoweredPanicBoundsRC, ssa.OpAMD64LoweredPanicBoundsCR, ssa.OpAMD64LoweredPanicBoundsCC:
  1346  		// Compute the constant we put in the PCData entry for this call.
  1347  		code, signed := ssa.BoundsKind(v.AuxInt).Code()
  1348  		xIsReg := false
  1349  		yIsReg := false
  1350  		xVal := 0
  1351  		yVal := 0
  1352  		switch v.Op {
  1353  		case ssa.OpAMD64LoweredPanicBoundsRR:
  1354  			xIsReg = true
  1355  			xVal = int(v.Args[0].Reg() - x86.REG_AX)
  1356  			yIsReg = true
  1357  			yVal = int(v.Args[1].Reg() - x86.REG_AX)
  1358  		case ssa.OpAMD64LoweredPanicBoundsRC:
  1359  			xIsReg = true
  1360  			xVal = int(v.Args[0].Reg() - x86.REG_AX)
  1361  			c := v.Aux.(ssa.PanicBoundsC).C
  1362  			if c >= 0 && c <= abi.BoundsMaxConst {
  1363  				yVal = int(c)
  1364  			} else {
  1365  				// Move constant to a register
  1366  				yIsReg = true
  1367  				if yVal == xVal {
  1368  					yVal = 1
  1369  				}
  1370  				p := s.Prog(x86.AMOVQ)
  1371  				p.From.Type = obj.TYPE_CONST
  1372  				p.From.Offset = c
  1373  				p.To.Type = obj.TYPE_REG
  1374  				p.To.Reg = x86.REG_AX + int16(yVal)
  1375  			}
  1376  		case ssa.OpAMD64LoweredPanicBoundsCR:
  1377  			yIsReg = true
  1378  			yVal = int(v.Args[0].Reg() - x86.REG_AX)
  1379  			c := v.Aux.(ssa.PanicBoundsC).C
  1380  			if c >= 0 && c <= abi.BoundsMaxConst {
  1381  				xVal = int(c)
  1382  			} else {
  1383  				// Move constant to a register
  1384  				xIsReg = true
  1385  				if xVal == yVal {
  1386  					xVal = 1
  1387  				}
  1388  				p := s.Prog(x86.AMOVQ)
  1389  				p.From.Type = obj.TYPE_CONST
  1390  				p.From.Offset = c
  1391  				p.To.Type = obj.TYPE_REG
  1392  				p.To.Reg = x86.REG_AX + int16(xVal)
  1393  			}
  1394  		case ssa.OpAMD64LoweredPanicBoundsCC:
  1395  			c := v.Aux.(ssa.PanicBoundsCC).Cx
  1396  			if c >= 0 && c <= abi.BoundsMaxConst {
  1397  				xVal = int(c)
  1398  			} else {
  1399  				// Move constant to a register
  1400  				xIsReg = true
  1401  				p := s.Prog(x86.AMOVQ)
  1402  				p.From.Type = obj.TYPE_CONST
  1403  				p.From.Offset = c
  1404  				p.To.Type = obj.TYPE_REG
  1405  				p.To.Reg = x86.REG_AX + int16(xVal)
  1406  			}
  1407  			c = v.Aux.(ssa.PanicBoundsCC).Cy
  1408  			if c >= 0 && c <= abi.BoundsMaxConst {
  1409  				yVal = int(c)
  1410  			} else {
  1411  				// Move constant to a register
  1412  				yIsReg = true
  1413  				yVal = 1
  1414  				p := s.Prog(x86.AMOVQ)
  1415  				p.From.Type = obj.TYPE_CONST
  1416  				p.From.Offset = c
  1417  				p.To.Type = obj.TYPE_REG
  1418  				p.To.Reg = x86.REG_AX + int16(yVal)
  1419  			}
  1420  		}
  1421  		c := abi.BoundsEncode(code, signed, xIsReg, yIsReg, xVal, yVal)
  1422  
  1423  		p := s.Prog(obj.APCDATA)
  1424  		p.From.SetConst(abi.PCDATA_PanicBounds)
  1425  		p.To.SetConst(int64(c))
  1426  		p = s.Prog(obj.ACALL)
  1427  		p.To.Type = obj.TYPE_MEM
  1428  		p.To.Name = obj.NAME_EXTERN
  1429  		p.To.Sym = ir.Syms.PanicBounds
  1430  
  1431  	case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
  1432  		ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
  1433  		ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
  1434  		p := s.Prog(v.Op.Asm())
  1435  		p.To.Type = obj.TYPE_REG
  1436  		p.To.Reg = v.Reg()
  1437  
  1438  	case ssa.OpAMD64NEGLflags:
  1439  		p := s.Prog(v.Op.Asm())
  1440  		p.To.Type = obj.TYPE_REG
  1441  		p.To.Reg = v.Reg0()
  1442  
  1443  	case ssa.OpAMD64ADDQconstflags, ssa.OpAMD64ADDLconstflags:
  1444  		p := s.Prog(v.Op.Asm())
  1445  		p.From.Type = obj.TYPE_CONST
  1446  		p.From.Offset = v.AuxInt
  1447  		// Note: the inc/dec instructions do not modify
  1448  		// the carry flag like add$1 / sub$1 do.
  1449  		// We currently never use the CF/OF flags from
  1450  		// these instructions, so that is ok.
  1451  		switch {
  1452  		case p.As == x86.AADDQ && p.From.Offset == 1:
  1453  			p.As = x86.AINCQ
  1454  			p.From.Type = obj.TYPE_NONE
  1455  		case p.As == x86.AADDQ && p.From.Offset == -1:
  1456  			p.As = x86.ADECQ
  1457  			p.From.Type = obj.TYPE_NONE
  1458  		case p.As == x86.AADDL && p.From.Offset == 1:
  1459  			p.As = x86.AINCL
  1460  			p.From.Type = obj.TYPE_NONE
  1461  		case p.As == x86.AADDL && p.From.Offset == -1:
  1462  			p.As = x86.ADECL
  1463  			p.From.Type = obj.TYPE_NONE
  1464  		}
  1465  		p.To.Type = obj.TYPE_REG
  1466  		p.To.Reg = v.Reg0()
  1467  
  1468  	case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS:
  1469  		p := s.Prog(v.Op.Asm())
  1470  		p.From.Type = obj.TYPE_REG
  1471  		p.From.Reg = v.Args[0].Reg()
  1472  		p.To.Type = obj.TYPE_REG
  1473  		switch v.Op {
  1474  		case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ:
  1475  			p.To.Reg = v.Reg0()
  1476  		case ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS:
  1477  			p.To.Reg = v.Reg()
  1478  		}
  1479  	case ssa.OpAMD64LoweredRound32F, ssa.OpAMD64LoweredRound64F:
  1480  		// input is already rounded
  1481  	case ssa.OpAMD64ROUNDSD:
  1482  		p := s.Prog(v.Op.Asm())
  1483  		val := v.AuxInt
  1484  		// 0 means math.RoundToEven, 1 Floor, 2 Ceil, 3 Trunc
  1485  		if val < 0 || val > 3 {
  1486  			v.Fatalf("Invalid rounding mode")
  1487  		}
  1488  		p.From.Offset = val
  1489  		p.From.Type = obj.TYPE_CONST
  1490  		p.AddRestSourceReg(v.Args[0].Reg())
  1491  		p.To.Type = obj.TYPE_REG
  1492  		p.To.Reg = v.Reg()
  1493  	case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL,
  1494  		ssa.OpAMD64TZCNTQ, ssa.OpAMD64TZCNTL,
  1495  		ssa.OpAMD64LZCNTQ, ssa.OpAMD64LZCNTL:
  1496  		if v.Args[0].Reg() != v.Reg() {
  1497  			// POPCNT/TZCNT/LZCNT have a false dependency on the destination register on Intel CPUs.
  1498  			// The TZCNT/LZCNT problem affects pre-Skylake models. See discussion at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62011#c7.
  1499  			// Xor register with itself to break the dependency.
  1500  			opregreg(s, x86.AXORL, v.Reg(), v.Reg())
  1501  		}
  1502  		p := s.Prog(v.Op.Asm())
  1503  		p.From.Type = obj.TYPE_REG
  1504  		p.From.Reg = v.Args[0].Reg()
  1505  		p.To.Type = obj.TYPE_REG
  1506  		p.To.Reg = v.Reg()
  1507  
  1508  	case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
  1509  		ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
  1510  		ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
  1511  		ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF,
  1512  		ssa.OpAMD64SETB, ssa.OpAMD64SETBE,
  1513  		ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN,
  1514  		ssa.OpAMD64SETA, ssa.OpAMD64SETAE,
  1515  		ssa.OpAMD64SETO:
  1516  		p := s.Prog(v.Op.Asm())
  1517  		p.To.Type = obj.TYPE_REG
  1518  		p.To.Reg = v.Reg()
  1519  
  1520  	case ssa.OpAMD64SETEQstore, ssa.OpAMD64SETNEstore,
  1521  		ssa.OpAMD64SETLstore, ssa.OpAMD64SETLEstore,
  1522  		ssa.OpAMD64SETGstore, ssa.OpAMD64SETGEstore,
  1523  		ssa.OpAMD64SETBstore, ssa.OpAMD64SETBEstore,
  1524  		ssa.OpAMD64SETAstore, ssa.OpAMD64SETAEstore:
  1525  		p := s.Prog(v.Op.Asm())
  1526  		p.To.Type = obj.TYPE_MEM
  1527  		p.To.Reg = v.Args[0].Reg()
  1528  		ssagen.AddAux(&p.To, v)
  1529  
  1530  	case ssa.OpAMD64SETEQstoreidx1, ssa.OpAMD64SETNEstoreidx1,
  1531  		ssa.OpAMD64SETLstoreidx1, ssa.OpAMD64SETLEstoreidx1,
  1532  		ssa.OpAMD64SETGstoreidx1, ssa.OpAMD64SETGEstoreidx1,
  1533  		ssa.OpAMD64SETBstoreidx1, ssa.OpAMD64SETBEstoreidx1,
  1534  		ssa.OpAMD64SETAstoreidx1, ssa.OpAMD64SETAEstoreidx1:
  1535  		p := s.Prog(v.Op.Asm())
  1536  		memIdx(&p.To, v)
  1537  		ssagen.AddAux(&p.To, v)
  1538  
  1539  	case ssa.OpAMD64SETNEF:
  1540  		t := v.RegTmp()
  1541  		p := s.Prog(v.Op.Asm())
  1542  		p.To.Type = obj.TYPE_REG
  1543  		p.To.Reg = v.Reg()
  1544  		q := s.Prog(x86.ASETPS)
  1545  		q.To.Type = obj.TYPE_REG
  1546  		q.To.Reg = t
  1547  		// ORL avoids a partial register write and is smaller than the ORQ the old compiler used.
  1548  		opregreg(s, x86.AORL, v.Reg(), t)
  1549  
  1550  	case ssa.OpAMD64SETEQF:
  1551  		t := v.RegTmp()
  1552  		p := s.Prog(v.Op.Asm())
  1553  		p.To.Type = obj.TYPE_REG
  1554  		p.To.Reg = v.Reg()
  1555  		q := s.Prog(x86.ASETPC)
  1556  		q.To.Type = obj.TYPE_REG
  1557  		q.To.Reg = t
  1558  		// ANDL avoids a partial register write and is smaller than the ANDQ the old compiler used.
  1559  		opregreg(s, x86.AANDL, v.Reg(), t)
  1560  
  1561  	case ssa.OpAMD64InvertFlags:
  1562  		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
  1563  	case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT:
  1564  		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
  1565  	case ssa.OpAMD64AddTupleFirst32, ssa.OpAMD64AddTupleFirst64:
  1566  		v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString())
  1567  	case ssa.OpAMD64REPSTOSQ:
  1568  		s.Prog(x86.AREP)
  1569  		s.Prog(x86.ASTOSQ)
  1570  	case ssa.OpAMD64REPMOVSQ:
  1571  		s.Prog(x86.AREP)
  1572  		s.Prog(x86.AMOVSQ)
  1573  	case ssa.OpAMD64LoweredNilCheck:
  1574  		// Issue a load which will fault if the input is nil.
  1575  		// TODO: We currently use the 2-byte instruction TESTB AX, (reg).
  1576  		// Should we use the 3-byte TESTB $0, (reg) instead? It is larger
  1577  		// but it doesn't have false dependency on AX.
  1578  		// Or maybe allocate an output register and use MOVL (reg),reg2 ?
  1579  		// That trades clobbering flags for clobbering a register.
  1580  		p := s.Prog(x86.ATESTB)
  1581  		p.From.Type = obj.TYPE_REG
  1582  		p.From.Reg = x86.REG_AX
  1583  		p.To.Type = obj.TYPE_MEM
  1584  		p.To.Reg = v.Args[0].Reg()
  1585  		if logopt.Enabled() {
  1586  			logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name)
  1587  		}
  1588  		if base.Debug.Nil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers
  1589  			base.WarnfAt(v.Pos, "generated nil check")
  1590  		}
  1591  	case ssa.OpAMD64MOVBatomicload, ssa.OpAMD64MOVLatomicload, ssa.OpAMD64MOVQatomicload:
  1592  		p := s.Prog(v.Op.Asm())
  1593  		p.From.Type = obj.TYPE_MEM
  1594  		p.From.Reg = v.Args[0].Reg()
  1595  		ssagen.AddAux(&p.From, v)
  1596  		p.To.Type = obj.TYPE_REG
  1597  		p.To.Reg = v.Reg0()
  1598  	case ssa.OpAMD64XCHGB, ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ:
  1599  		p := s.Prog(v.Op.Asm())
  1600  		p.From.Type = obj.TYPE_REG
  1601  		p.From.Reg = v.Reg0()
  1602  		p.To.Type = obj.TYPE_MEM
  1603  		p.To.Reg = v.Args[1].Reg()
  1604  		ssagen.AddAux(&p.To, v)
  1605  	case ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock:
  1606  		s.Prog(x86.ALOCK)
  1607  		p := s.Prog(v.Op.Asm())
  1608  		p.From.Type = obj.TYPE_REG
  1609  		p.From.Reg = v.Reg0()
  1610  		p.To.Type = obj.TYPE_MEM
  1611  		p.To.Reg = v.Args[1].Reg()
  1612  		ssagen.AddAux(&p.To, v)
  1613  	case ssa.OpAMD64CMPXCHGLlock, ssa.OpAMD64CMPXCHGQlock:
  1614  		if v.Args[1].Reg() != x86.REG_AX {
  1615  			v.Fatalf("input[1] not in AX %s", v.LongString())
  1616  		}
  1617  		s.Prog(x86.ALOCK)
  1618  		p := s.Prog(v.Op.Asm())
  1619  		p.From.Type = obj.TYPE_REG
  1620  		p.From.Reg = v.Args[2].Reg()
  1621  		p.To.Type = obj.TYPE_MEM
  1622  		p.To.Reg = v.Args[0].Reg()
  1623  		ssagen.AddAux(&p.To, v)
  1624  		p = s.Prog(x86.ASETEQ)
  1625  		p.To.Type = obj.TYPE_REG
  1626  		p.To.Reg = v.Reg0()
  1627  	case ssa.OpAMD64ANDBlock, ssa.OpAMD64ANDLlock, ssa.OpAMD64ANDQlock, ssa.OpAMD64ORBlock, ssa.OpAMD64ORLlock, ssa.OpAMD64ORQlock:
  1628  		// Atomic memory operations that don't need to return the old value.
  1629  		s.Prog(x86.ALOCK)
  1630  		p := s.Prog(v.Op.Asm())
  1631  		p.From.Type = obj.TYPE_REG
  1632  		p.From.Reg = v.Args[1].Reg()
  1633  		p.To.Type = obj.TYPE_MEM
  1634  		p.To.Reg = v.Args[0].Reg()
  1635  		ssagen.AddAux(&p.To, v)
  1636  	case ssa.OpAMD64LoweredAtomicAnd64, ssa.OpAMD64LoweredAtomicOr64, ssa.OpAMD64LoweredAtomicAnd32, ssa.OpAMD64LoweredAtomicOr32:
  1637  		// Atomic memory operations that need to return the old value.
  1638  		// We need to do these with compare-and-exchange to get access to the old value.
  1639  		// loop:
  1640  		// MOVQ mask, tmp
  1641  		// MOVQ (addr), AX
  1642  		// ANDQ AX, tmp
  1643  		// LOCK CMPXCHGQ tmp, (addr) : note that AX is the implicit old value to compare against
  1644  		// JNE loop
  1645  		// : result in AX
  1646  		mov := x86.AMOVQ
  1647  		op := x86.AANDQ
  1648  		cmpxchg := x86.ACMPXCHGQ
  1649  		switch v.Op {
  1650  		case ssa.OpAMD64LoweredAtomicOr64:
  1651  			op = x86.AORQ
  1652  		case ssa.OpAMD64LoweredAtomicAnd32:
  1653  			mov = x86.AMOVL
  1654  			op = x86.AANDL
  1655  			cmpxchg = x86.ACMPXCHGL
  1656  		case ssa.OpAMD64LoweredAtomicOr32:
  1657  			mov = x86.AMOVL
  1658  			op = x86.AORL
  1659  			cmpxchg = x86.ACMPXCHGL
  1660  		}
  1661  		addr := v.Args[0].Reg()
  1662  		mask := v.Args[1].Reg()
  1663  		tmp := v.RegTmp()
  1664  		p1 := s.Prog(mov)
  1665  		p1.From.Type = obj.TYPE_REG
  1666  		p1.From.Reg = mask
  1667  		p1.To.Type = obj.TYPE_REG
  1668  		p1.To.Reg = tmp
  1669  		p2 := s.Prog(mov)
  1670  		p2.From.Type = obj.TYPE_MEM
  1671  		p2.From.Reg = addr
  1672  		ssagen.AddAux(&p2.From, v)
  1673  		p2.To.Type = obj.TYPE_REG
  1674  		p2.To.Reg = x86.REG_AX
  1675  		p3 := s.Prog(op)
  1676  		p3.From.Type = obj.TYPE_REG
  1677  		p3.From.Reg = x86.REG_AX
  1678  		p3.To.Type = obj.TYPE_REG
  1679  		p3.To.Reg = tmp
  1680  		s.Prog(x86.ALOCK)
  1681  		p5 := s.Prog(cmpxchg)
  1682  		p5.From.Type = obj.TYPE_REG
  1683  		p5.From.Reg = tmp
  1684  		p5.To.Type = obj.TYPE_MEM
  1685  		p5.To.Reg = addr
  1686  		ssagen.AddAux(&p5.To, v)
  1687  		p6 := s.Prog(x86.AJNE)
  1688  		p6.To.Type = obj.TYPE_BRANCH
  1689  		p6.To.SetTarget(p1)
  1690  	case ssa.OpAMD64PrefetchT0, ssa.OpAMD64PrefetchNTA:
  1691  		p := s.Prog(v.Op.Asm())
  1692  		p.From.Type = obj.TYPE_MEM
  1693  		p.From.Reg = v.Args[0].Reg()
  1694  	case ssa.OpClobber:
  1695  		p := s.Prog(x86.AMOVL)
  1696  		p.From.Type = obj.TYPE_CONST
  1697  		p.From.Offset = 0xdeaddead
  1698  		p.To.Type = obj.TYPE_MEM
  1699  		p.To.Reg = x86.REG_SP
  1700  		ssagen.AddAux(&p.To, v)
  1701  		p = s.Prog(x86.AMOVL)
  1702  		p.From.Type = obj.TYPE_CONST
  1703  		p.From.Offset = 0xdeaddead
  1704  		p.To.Type = obj.TYPE_MEM
  1705  		p.To.Reg = x86.REG_SP
  1706  		ssagen.AddAux(&p.To, v)
  1707  		p.To.Offset += 4
  1708  	case ssa.OpClobberReg:
  1709  		x := uint64(0xdeaddeaddeaddead)
  1710  		p := s.Prog(x86.AMOVQ)
  1711  		p.From.Type = obj.TYPE_CONST
  1712  		p.From.Offset = int64(x)
  1713  		p.To.Type = obj.TYPE_REG
  1714  		p.To.Reg = v.Reg()
  1715  
  1716  	// SIMD ops
  1717  	case ssa.OpAMD64VZEROUPPER, ssa.OpAMD64VZEROALL:
  1718  		s.Prog(v.Op.Asm())
  1719  
  1720  	case ssa.OpAMD64Zero128: // no code emitted
  1721  
  1722  	case ssa.OpAMD64Zero256, ssa.OpAMD64Zero512:
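        		// From, RestSource, and To are all the same register, so the
        		// op combines the register with itself, the usual
        		// xor-with-self zeroing idiom.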
  1723  		p := s.Prog(v.Op.Asm())
  1724  		p.From.Type = obj.TYPE_REG
  1725  		p.From.Reg = simdReg(v)
  1726  		p.AddRestSourceReg(simdReg(v))
  1727  		p.To.Type = obj.TYPE_REG
  1728  		p.To.Reg = simdReg(v)
  1729  
  1730  	case ssa.OpAMD64VMOVSSf2v, ssa.OpAMD64VMOVSDf2v:
  1731  		// These initialize the least significant 32/64 bits of a SIMD register from a "float"; the upper bits come from X15, which the Go ABI keeps zeroed.
  1732  		p := s.Prog(v.Op.Asm())
  1733  		p.From.Type = obj.TYPE_REG
  1734  		p.From.Reg = v.Args[0].Reg()
  1735  		p.AddRestSourceReg(x86.REG_X15)
  1736  		p.To.Type = obj.TYPE_REG
  1737  		p.To.Reg = simdReg(v)
  1738  
  1739  	case ssa.OpAMD64VMOVQload, ssa.OpAMD64VMOVDload,
  1740  		ssa.OpAMD64VMOVSSload, ssa.OpAMD64VMOVSDload:
  1741  		p := s.Prog(v.Op.Asm())
  1742  		p.From.Type = obj.TYPE_MEM
  1743  		p.From.Reg = v.Args[0].Reg()
  1744  		ssagen.AddAux(&p.From, v)
  1745  		p.To.Type = obj.TYPE_REG
  1746  		p.To.Reg = simdReg(v)
  1747  
  1748  	case ssa.OpAMD64VMOVSSconst, ssa.OpAMD64VMOVSDconst:
  1749  		// for loading constants directly into SIMD registers
  1750  		x := simdReg(v)
  1751  		p := s.Prog(v.Op.Asm())
  1752  		p.From.Type = obj.TYPE_FCONST
  1753  		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
  1754  		p.To.Type = obj.TYPE_REG
  1755  		p.To.Reg = x
  1756  
  1757  	case ssa.OpAMD64VMOVD, ssa.OpAMD64VMOVQ:
  1758  		// These are for initializing the least significant 32/64 bits of a SIMD register from an "int".
  1759  		p := s.Prog(v.Op.Asm())
  1760  		p.From.Type = obj.TYPE_REG
  1761  		p.From.Reg = v.Args[0].Reg()
  1762  		p.To.Type = obj.TYPE_REG
  1763  		p.To.Reg = simdReg(v)
  1764  
  1765  	case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512,
  1766  		ssa.OpAMD64KMOVBload, ssa.OpAMD64KMOVWload, ssa.OpAMD64KMOVDload, ssa.OpAMD64KMOVQload:
  1767  		p := s.Prog(v.Op.Asm())
  1768  		p.From.Type = obj.TYPE_MEM
  1769  		p.From.Reg = v.Args[0].Reg()
  1770  		ssagen.AddAux(&p.From, v)
  1771  		p.To.Type = obj.TYPE_REG
  1772  		p.To.Reg = simdOrMaskReg(v)
  1773  	case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512,
  1774  		ssa.OpAMD64KMOVBstore, ssa.OpAMD64KMOVWstore, ssa.OpAMD64KMOVDstore, ssa.OpAMD64KMOVQstore:
  1775  		p := s.Prog(v.Op.Asm())
  1776  		p.From.Type = obj.TYPE_REG
  1777  		p.From.Reg = simdOrMaskReg(v.Args[1])
  1778  		p.To.Type = obj.TYPE_MEM
  1779  		p.To.Reg = v.Args[0].Reg()
  1780  		ssagen.AddAux(&p.To, v)
  1781  
  1782  	case ssa.OpAMD64VPMASK32load128, ssa.OpAMD64VPMASK64load128, ssa.OpAMD64VPMASK32load256, ssa.OpAMD64VPMASK64load256:
  1783  		p := s.Prog(v.Op.Asm())
  1784  		p.From.Type = obj.TYPE_MEM
  1785  		p.From.Reg = v.Args[0].Reg()
  1786  		ssagen.AddAux(&p.From, v)
  1787  		p.To.Type = obj.TYPE_REG
  1788  		p.To.Reg = simdReg(v)
  1789  		p.AddRestSourceReg(simdReg(v.Args[1])) // masking simd reg
  1790  
  1791  	case ssa.OpAMD64VPMASK32store128, ssa.OpAMD64VPMASK64store128, ssa.OpAMD64VPMASK32store256, ssa.OpAMD64VPMASK64store256:
  1792  		p := s.Prog(v.Op.Asm())
  1793  		p.From.Type = obj.TYPE_REG
  1794  		p.From.Reg = simdReg(v.Args[2])
  1795  		p.To.Type = obj.TYPE_MEM
  1796  		p.To.Reg = v.Args[0].Reg()
  1797  		ssagen.AddAux(&p.To, v)
  1798  		p.AddRestSourceReg(simdReg(v.Args[1])) // masking simd reg
  1799  
  1800  	case ssa.OpAMD64VPMASK64load512, ssa.OpAMD64VPMASK32load512, ssa.OpAMD64VPMASK16load512, ssa.OpAMD64VPMASK8load512:
  1801  		p := s.Prog(v.Op.Asm())
  1802  		p.From.Type = obj.TYPE_MEM
  1803  		p.From.Reg = v.Args[0].Reg()
  1804  		ssagen.AddAux(&p.From, v)
  1805  		p.To.Type = obj.TYPE_REG
  1806  		p.To.Reg = simdReg(v)
  1807  		p.AddRestSourceReg(v.Args[1].Reg()) // simd mask reg
  1808  		x86.ParseSuffix(p, "Z")             // zeroing-masking: elements not selected by the mask are zeroed
  1809  
  1810  	case ssa.OpAMD64VPMASK64store512, ssa.OpAMD64VPMASK32store512, ssa.OpAMD64VPMASK16store512, ssa.OpAMD64VPMASK8store512:
  1811  		p := s.Prog(v.Op.Asm())
  1812  		p.From.Type = obj.TYPE_REG
  1813  		p.From.Reg = simdReg(v.Args[2])
  1814  		p.To.Type = obj.TYPE_MEM
  1815  		p.To.Reg = v.Args[0].Reg()
  1816  		ssagen.AddAux(&p.To, v)
  1817  		p.AddRestSourceReg(v.Args[1].Reg()) // simd mask reg
  1818  
  1819  	case ssa.OpAMD64VPMOVMToVec8x16,
  1820  		ssa.OpAMD64VPMOVMToVec8x32,
  1821  		ssa.OpAMD64VPMOVMToVec8x64,
  1822  		ssa.OpAMD64VPMOVMToVec16x8,
  1823  		ssa.OpAMD64VPMOVMToVec16x16,
  1824  		ssa.OpAMD64VPMOVMToVec16x32,
  1825  		ssa.OpAMD64VPMOVMToVec32x4,
  1826  		ssa.OpAMD64VPMOVMToVec32x8,
  1827  		ssa.OpAMD64VPMOVMToVec32x16,
  1828  		ssa.OpAMD64VPMOVMToVec64x2,
  1829  		ssa.OpAMD64VPMOVMToVec64x4,
  1830  		ssa.OpAMD64VPMOVMToVec64x8:
  1831  		p := s.Prog(v.Op.Asm())
  1832  		p.From.Type = obj.TYPE_REG
  1833  		p.From.Reg = v.Args[0].Reg()
  1834  		p.To.Type = obj.TYPE_REG
  1835  		p.To.Reg = simdReg(v)
  1836  
  1837  	case ssa.OpAMD64VPMOVVec8x16ToM,
  1838  		ssa.OpAMD64VPMOVVec8x32ToM,
  1839  		ssa.OpAMD64VPMOVVec8x64ToM,
  1840  		ssa.OpAMD64VPMOVVec16x8ToM,
  1841  		ssa.OpAMD64VPMOVVec16x16ToM,
  1842  		ssa.OpAMD64VPMOVVec16x32ToM,
  1843  		ssa.OpAMD64VPMOVVec32x4ToM,
  1844  		ssa.OpAMD64VPMOVVec32x8ToM,
  1845  		ssa.OpAMD64VPMOVVec32x16ToM,
  1846  		ssa.OpAMD64VPMOVVec64x2ToM,
  1847  		ssa.OpAMD64VPMOVVec64x4ToM,
  1848  		ssa.OpAMD64VPMOVVec64x8ToM,
  1849  		ssa.OpAMD64VPMOVMSKB128,
  1850  		ssa.OpAMD64VPMOVMSKB256,
  1851  		ssa.OpAMD64VMOVMSKPS128,
  1852  		ssa.OpAMD64VMOVMSKPS256,
  1853  		ssa.OpAMD64VMOVMSKPD128,
  1854  		ssa.OpAMD64VMOVMSKPD256:
  1855  		p := s.Prog(v.Op.Asm())
  1856  		p.From.Type = obj.TYPE_REG
  1857  		p.From.Reg = simdReg(v.Args[0])
  1858  		p.To.Type = obj.TYPE_REG
  1859  		p.To.Reg = v.Reg()
  1860  
  1861  	case ssa.OpAMD64KMOVQk, ssa.OpAMD64KMOVDk, ssa.OpAMD64KMOVWk, ssa.OpAMD64KMOVBk,
  1862  		ssa.OpAMD64KMOVQi, ssa.OpAMD64KMOVDi, ssa.OpAMD64KMOVWi, ssa.OpAMD64KMOVBi:
  1863  		// See also ssa.OpAMD64KMOVQload
  1864  		p := s.Prog(v.Op.Asm())
  1865  		p.From.Type = obj.TYPE_REG
  1866  		p.From.Reg = v.Args[0].Reg()
  1867  		p.To.Type = obj.TYPE_REG
  1868  		p.To.Reg = v.Reg()
  1869  	case ssa.OpAMD64VPTEST:
  1870  		// Flag-setting instructions like this one take their second operand
  1871  		// in the Prog's destination slot. See also CMP[BWDQ].
  1872  		p := s.Prog(v.Op.Asm())
  1873  		p.From.Type = obj.TYPE_REG
  1874  		p.From.Reg = simdReg(v.Args[0])
  1875  		p.To.Type = obj.TYPE_REG
  1876  		p.To.Reg = simdReg(v.Args[1])
  1877  
  1878  	default:
  1879  		if !ssaGenSIMDValue(s, v) {
  1880  			v.Fatalf("genValue not implemented: %s", v.LongString())
  1881  		}
  1882  	}
  1883  }
  1884  
  1885  // zeroX15 zeroes the X15 register.
  1886  func zeroX15(s *ssagen.State) {
  1887  	opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
  1888  }
  1889  
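        // Naming convention for the simd* helpers below (inferred from
        // their bodies): "V21" means two vector inputs and one vector
        // result, an interior "k" is a K-mask operand and a trailing "k" a
        // mask result, "fp" marks an operand used directly as an XMM
        // register, "Imm8" adds an 8-bit immediate from AuxInt, "load"
        // adds a memory operand, and "ResultInArg0" ops overwrite Args[0].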
  1890  // Example instruction: VRSQRTPS X1, X1
  1891  func simdV11(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1892  	p := s.Prog(v.Op.Asm())
  1893  	p.From.Type = obj.TYPE_REG
  1894  	p.From.Reg = simdReg(v.Args[0])
  1895  	p.To.Type = obj.TYPE_REG
  1896  	p.To.Reg = simdReg(v)
  1897  	return p
  1898  }
  1899  
  1900  // Example instruction: VPSUBD X1, X2, X3
  1901  func simdV21(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1902  	p := s.Prog(v.Op.Asm())
  1903  	p.From.Type = obj.TYPE_REG
  1904  	// Vector register operands follow a right-to-left order.
  1905  	// e.g. VPSUBD X1, X2, X3 means X3 = X2 - X1.
  1906  	p.From.Reg = simdReg(v.Args[1])
  1907  	p.AddRestSourceReg(simdReg(v.Args[0]))
  1908  	p.To.Type = obj.TYPE_REG
  1909  	p.To.Reg = simdReg(v)
  1910  	return p
  1911  }
  1912  
  1913  // This function handles the shifts, whose 2nd arg is an XMM register;
  1914  // that register is used as-is rather than resized via simdReg.
  1915  // Example instruction: VPSLLQ Z1, X1, Z2
  1916  func simdVfpv(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1917  	p := s.Prog(v.Op.Asm())
  1918  	p.From.Type = obj.TYPE_REG
  1919  	// Vector register operands follow a right-to-left order.
  1920  	// e.g. VPSUBD X1, X2, X3 means X3 = X2 - X1.
  1921  	p.From.Reg = v.Args[1].Reg()
  1922  	p.AddRestSourceReg(simdReg(v.Args[0]))
  1923  	p.To.Type = obj.TYPE_REG
  1924  	p.To.Reg = simdReg(v)
  1925  	return p
  1926  }
  1927  
  1928  // Example instruction: VPCMPEQW Z26, Z30, K4
  1929  func simdV2k(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1930  	p := s.Prog(v.Op.Asm())
  1931  	p.From.Type = obj.TYPE_REG
  1932  	p.From.Reg = simdReg(v.Args[1])
  1933  	p.AddRestSourceReg(simdReg(v.Args[0]))
  1934  	p.To.Type = obj.TYPE_REG
  1935  	p.To.Reg = maskReg(v)
  1936  	return p
  1937  }
  1938  
  1939  // Example instruction: VPMINUQ X21, X3, K3, X31
  1940  func simdV2kv(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1941  	p := s.Prog(v.Op.Asm())
  1942  	p.From.Type = obj.TYPE_REG
  1943  	p.From.Reg = simdReg(v.Args[1])
  1944  	p.AddRestSourceReg(simdReg(v.Args[0]))
  1945  	// These "simd*" functions assume that
  1946  	// any "K" register that serves as the write-mask
  1947  	// or "predicate" for predicated AVX512 instructions
  1948  	// sits right at the end of the operand list.
  1949  	// TODO: verify this assumption.
  1950  	p.AddRestSourceReg(maskReg(v.Args[2]))
  1951  	p.To.Type = obj.TYPE_REG
  1952  	p.To.Reg = simdReg(v)
  1953  	return p
  1954  }
  1955  
  1956  // Example instruction: VPABSB X1, X2, K3 (masking merging)
  1957  func simdV2kvResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1958  	p := s.Prog(v.Op.Asm())
  1959  	p.From.Type = obj.TYPE_REG
  1960  	p.From.Reg = simdReg(v.Args[1])
  1961  	// These "simd*" functions assume that
  1962  	// any "K" register that serves as the write-mask
  1963  	// or "predicate" for predicated AVX512 instructions
  1964  	// sits right at the end of the operand list.
  1965  	// TODO: verify this assumption.
  1966  	p.AddRestSourceReg(maskReg(v.Args[2]))
  1967  	p.To.Type = obj.TYPE_REG
  1968  	p.To.Reg = simdReg(v)
  1969  	return p
  1970  }
  1971  
  1972  // This function handles the shifts, whose 2nd arg is an XMM register;
  1973  // that register is used as-is rather than resized via simdReg.
  1974  // Example instruction: VPSLLQ Z1, X1, K1, Z2
  1975  func simdVfpkv(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1976  	p := s.Prog(v.Op.Asm())
  1977  	p.From.Type = obj.TYPE_REG
  1978  	p.From.Reg = v.Args[1].Reg()
  1979  	p.AddRestSourceReg(simdReg(v.Args[0]))
  1980  	p.AddRestSourceReg(maskReg(v.Args[2]))
  1981  	p.To.Type = obj.TYPE_REG
  1982  	p.To.Reg = simdReg(v)
  1983  	return p
  1984  }
  1985  
  1986  // Example instruction: VPCMPEQW Z26, Z30, K1, K4
  1987  func simdV2kk(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1988  	p := s.Prog(v.Op.Asm())
  1989  	p.From.Type = obj.TYPE_REG
  1990  	p.From.Reg = simdReg(v.Args[1])
  1991  	p.AddRestSourceReg(simdReg(v.Args[0]))
  1992  	p.AddRestSourceReg(maskReg(v.Args[2]))
  1993  	p.To.Type = obj.TYPE_REG
  1994  	p.To.Reg = maskReg(v)
  1995  	return p
  1996  }
  1997  
  1998  // Example instruction: VPOPCNTB X14, K4, X16
  1999  func simdVkv(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2000  	p := s.Prog(v.Op.Asm())
  2001  	p.From.Type = obj.TYPE_REG
  2002  	p.From.Reg = simdReg(v.Args[0])
  2003  	p.AddRestSourceReg(maskReg(v.Args[1]))
  2004  	p.To.Type = obj.TYPE_REG
  2005  	p.To.Reg = simdReg(v)
  2006  	return p
  2007  }
  2008  
  2009  // Example instruction: VROUNDPD $7, X2, X2
  2010  func simdV11Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2011  	p := s.Prog(v.Op.Asm())
  2012  	p.From.Offset = int64(v.AuxUInt8())
  2013  	p.From.Type = obj.TYPE_CONST
  2014  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2015  	p.To.Type = obj.TYPE_REG
  2016  	p.To.Reg = simdReg(v)
  2017  	return p
  2018  }
  2019  
  2020  // Example instruction: VREDUCEPD $126, X1, K3, X31
  2021  func simdVkvImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2022  	p := s.Prog(v.Op.Asm())
  2023  	p.From.Offset = int64(v.AuxUInt8())
  2024  	p.From.Type = obj.TYPE_CONST
  2025  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2026  	p.AddRestSourceReg(maskReg(v.Args[1]))
  2027  	p.To.Type = obj.TYPE_REG
  2028  	p.To.Reg = simdReg(v)
  2029  	return p
  2030  }
  2031  
  2032  // Example instruction: VCMPPS $7, X2, X9, X2
  2033  func simdV21Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2034  	p := s.Prog(v.Op.Asm())
  2035  	p.From.Offset = int64(v.AuxUInt8())
  2036  	p.From.Type = obj.TYPE_CONST
  2037  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2038  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2039  	p.To.Type = obj.TYPE_REG
  2040  	p.To.Reg = simdReg(v)
  2041  	return p
  2042  }
  2043  
  2044  // Example instruction: VPINSRB $3, DX, X0, X0
  2045  func simdVgpvImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2046  	p := s.Prog(v.Op.Asm())
  2047  	p.From.Offset = int64(v.AuxUInt8())
  2048  	p.From.Type = obj.TYPE_CONST
  2049  	p.AddRestSourceReg(v.Args[1].Reg())
  2050  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2051  	p.To.Type = obj.TYPE_REG
  2052  	p.To.Reg = simdReg(v)
  2053  	return p
  2054  }
  2055  
  2056  // Example instruction: VPCMPD $1, Z1, Z2, K1
  2057  func simdV2kImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2058  	p := s.Prog(v.Op.Asm())
  2059  	p.From.Offset = int64(v.AuxUInt8())
  2060  	p.From.Type = obj.TYPE_CONST
  2061  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2062  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2063  	p.To.Type = obj.TYPE_REG
  2064  	p.To.Reg = maskReg(v)
  2065  	return p
  2066  }
  2067  
  2068  // Example instruction: VPCMPD $1, Z1, Z2, K2, K1
  2069  func simdV2kkImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2070  	p := s.Prog(v.Op.Asm())
  2071  	p.From.Offset = int64(v.AuxUInt8())
  2072  	p.From.Type = obj.TYPE_CONST
  2073  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2074  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2075  	p.AddRestSourceReg(maskReg(v.Args[2]))
  2076  	p.To.Type = obj.TYPE_REG
  2077  	p.To.Reg = maskReg(v)
  2078  	return p
  2079  }
  2080  
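        // Like simdV2kkImm8, but the result goes to a vector register.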
  2081  func simdV2kvImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2082  	p := s.Prog(v.Op.Asm())
  2083  	p.From.Offset = int64(v.AuxUInt8())
  2084  	p.From.Type = obj.TYPE_CONST
  2085  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2086  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2087  	p.AddRestSourceReg(maskReg(v.Args[2]))
  2088  	p.To.Type = obj.TYPE_REG
  2089  	p.To.Reg = simdReg(v)
  2090  	return p
  2091  }
  2092  
  2093  // Example instruction: VFMADD213PD Z2, Z1, Z0
  2094  func simdV31ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2095  	p := s.Prog(v.Op.Asm())
  2096  	p.From.Type = obj.TYPE_REG
  2097  	p.From.Reg = simdReg(v.Args[2])
  2098  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2099  	p.To.Type = obj.TYPE_REG
  2100  	p.To.Reg = simdReg(v)
  2101  	return p
  2102  }
  2103  
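        // Like simdV31ResultInArg0, but with an 8-bit immediate from AuxInt.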
  2104  func simdV31ResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2105  	p := s.Prog(v.Op.Asm())
  2106  	p.From.Offset = int64(v.AuxUInt8())
  2107  	p.From.Type = obj.TYPE_CONST
  2108  
  2109  	p.AddRestSourceReg(simdReg(v.Args[2]))
  2110  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2111  	// p.AddRestSourceReg(x86.REG_K0)
  2112  	p.To.Type = obj.TYPE_REG
  2113  	p.To.Reg = simdReg(v)
  2114  	return p
  2115  }
  2116  
  2117  // simdV31loadResultInArg0Imm8
  2118  // Example SSA form:
  2119  // (VPTERNLOGD128load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem)
  2120  func simdV31loadResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2121  	sc := v.AuxValAndOff()
  2122  	p := s.Prog(v.Op.Asm())
  2123  
  2124  	p.From.Type = obj.TYPE_CONST
  2125  	p.From.Offset = sc.Val64()
  2126  
  2127  	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[2].Reg()}
  2128  	ssagen.AddAux2(&m, v, sc.Off64())
  2129  	p.AddRestSource(m)
  2130  
  2131  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2132  	return p
  2133  }
  2134  
  2135  // Example instruction: VFMADD213PD Z2, Z1, K1, Z0
  2136  func simdV3kvResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2137  	p := s.Prog(v.Op.Asm())
  2138  	p.From.Type = obj.TYPE_REG
  2139  	p.From.Reg = simdReg(v.Args[2])
  2140  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2141  	p.AddRestSourceReg(maskReg(v.Args[3]))
  2142  	p.To.Type = obj.TYPE_REG
  2143  	p.To.Reg = simdReg(v)
  2144  	return p
  2145  }
  2146  
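        // Vector input, GP-register output, with an 8-bit immediate (a VPEXTR*-style shape).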
  2147  func simdVgpImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2148  	p := s.Prog(v.Op.Asm())
  2149  	p.From.Offset = int64(v.AuxUInt8())
  2150  	p.From.Type = obj.TYPE_CONST
  2151  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2152  	p.To.Type = obj.TYPE_REG
  2153  	p.To.Reg = v.Reg()
  2154  	return p
  2155  }
  2156  
  2157  // Currently unused
  2158  func simdV31(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2159  	p := s.Prog(v.Op.Asm())
  2160  	p.From.Type = obj.TYPE_REG
  2161  	p.From.Reg = simdReg(v.Args[2])
  2162  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2163  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2164  	p.To.Type = obj.TYPE_REG
  2165  	p.To.Reg = simdReg(v)
  2166  	return p
  2167  }
  2168  
  2169  // Currently unused
  2170  func simdV3kv(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2171  	p := s.Prog(v.Op.Asm())
  2172  	p.From.Type = obj.TYPE_REG
  2173  	p.From.Reg = simdReg(v.Args[2])
  2174  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2175  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2176  	p.AddRestSourceReg(maskReg(v.Args[3]))
  2177  	p.To.Type = obj.TYPE_REG
  2178  	p.To.Reg = simdReg(v)
  2179  	return p
  2180  }
  2181  
  2182  // Example instruction: VRCP14PS (DI), K6, X22
  2183  func simdVkvload(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2184  	p := s.Prog(v.Op.Asm())
  2185  	p.From.Type = obj.TYPE_MEM
  2186  	p.From.Reg = v.Args[0].Reg()
  2187  	ssagen.AddAux(&p.From, v)
  2188  	p.AddRestSourceReg(maskReg(v.Args[1]))
  2189  	p.To.Type = obj.TYPE_REG
  2190  	p.To.Reg = simdReg(v)
  2191  	return p
  2192  }
  2193  
  2194  // Example instruction: VPSLLVD (DX), X7, X18
  2195  func simdV21load(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2196  	p := s.Prog(v.Op.Asm())
  2197  	p.From.Type = obj.TYPE_MEM
  2198  	p.From.Reg = v.Args[1].Reg()
  2199  	ssagen.AddAux(&p.From, v)
  2200  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2201  	p.To.Type = obj.TYPE_REG
  2202  	p.To.Reg = simdReg(v)
  2203  	return p
  2204  }
  2205  
  2206  // Example instruction: VPDPWSSD (SI), X24, X18
  2207  func simdV31loadResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2208  	p := s.Prog(v.Op.Asm())
  2209  	p.From.Type = obj.TYPE_MEM
  2210  	p.From.Reg = v.Args[2].Reg()
  2211  	ssagen.AddAux(&p.From, v)
  2212  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2213  	p.To.Type = obj.TYPE_REG
  2214  	p.To.Reg = simdReg(v)
  2215  	return p
  2216  }
  2217  
  2218  // Example instruction: VPDPWSSD (SI), X24, K1, X18
  2219  func simdV3kvloadResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2220  	p := s.Prog(v.Op.Asm())
  2221  	p.From.Type = obj.TYPE_MEM
  2222  	p.From.Reg = v.Args[2].Reg()
  2223  	ssagen.AddAux(&p.From, v)
  2224  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2225  	p.AddRestSourceReg(maskReg(v.Args[3]))
  2226  	p.To.Type = obj.TYPE_REG
  2227  	p.To.Reg = simdReg(v)
  2228  	return p
  2229  }
  2230  
  2231  // Example instruction: VPSLLVD (SI), X1, K1, X2
  2232  func simdV2kvload(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2233  	p := s.Prog(v.Op.Asm())
  2234  	p.From.Type = obj.TYPE_MEM
  2235  	p.From.Reg = v.Args[1].Reg()
  2236  	ssagen.AddAux(&p.From, v)
  2237  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2238  	p.AddRestSourceReg(maskReg(v.Args[2]))
  2239  	p.To.Type = obj.TYPE_REG
  2240  	p.To.Reg = simdReg(v)
  2241  	return p
  2242  }
  2243  
  2244  // Example instruction: VPCMPEQD (SI), X1, K1
  2245  func simdV2kload(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2246  	p := s.Prog(v.Op.Asm())
  2247  	p.From.Type = obj.TYPE_MEM
  2248  	p.From.Reg = v.Args[1].Reg()
  2249  	ssagen.AddAux(&p.From, v)
  2250  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2251  	p.To.Type = obj.TYPE_REG
  2252  	p.To.Reg = maskReg(v)
  2253  	return p
  2254  }
  2255  
  2256  // Example instruction: VCVTTPS2DQ (BX), X2
  2257  func simdV11load(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2258  	p := s.Prog(v.Op.Asm())
  2259  	p.From.Type = obj.TYPE_MEM
  2260  	p.From.Reg = v.Args[0].Reg()
  2261  	ssagen.AddAux(&p.From, v)
  2262  	p.To.Type = obj.TYPE_REG
  2263  	p.To.Reg = simdReg(v)
  2264  	return p
  2265  }
  2266  
  2267  // Example instruction: VPSHUFD $7, (BX), X11
  2268  func simdV11loadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2269  	sc := v.AuxValAndOff()
  2270  	p := s.Prog(v.Op.Asm())
  2271  	p.From.Type = obj.TYPE_CONST
  2272  	p.From.Offset = sc.Val64()
  2273  	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
  2274  	ssagen.AddAux2(&m, v, sc.Off64())
  2275  	p.AddRestSource(m)
  2276  	p.To.Type = obj.TYPE_REG
  2277  	p.To.Reg = simdReg(v)
  2278  	return p
  2279  }
  2280  
  2281  // Example instruction: VPRORD $81, -15(R14), K7, Y1
  2282  func simdVkvloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2283  	sc := v.AuxValAndOff()
  2284  	p := s.Prog(v.Op.Asm())
  2285  	p.From.Type = obj.TYPE_CONST
  2286  	p.From.Offset = sc.Val64()
  2287  	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
  2288  	ssagen.AddAux2(&m, v, sc.Off64())
  2289  	p.AddRestSource(m)
  2290  	p.AddRestSourceReg(maskReg(v.Args[1]))
  2291  	p.To.Type = obj.TYPE_REG
  2292  	p.To.Reg = simdReg(v)
  2293  	return p
  2294  }
  2295  
  2296  // Example instruction: VPSHLDD $82, 7(SI), Y21, Y3
  2297  func simdV21loadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2298  	sc := v.AuxValAndOff()
  2299  	p := s.Prog(v.Op.Asm())
  2300  	p.From.Type = obj.TYPE_CONST
  2301  	p.From.Offset = sc.Val64()
  2302  	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
  2303  	ssagen.AddAux2(&m, v, sc.Off64())
  2304  	p.AddRestSource(m)
  2305  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2306  	p.To.Type = obj.TYPE_REG
  2307  	p.To.Reg = simdReg(v)
  2308  	return p
  2309  }
  2310  
  2311  // Example instruction: VCMPPS $81, -7(DI), Y16, K3
  2312  func simdV2kloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2313  	sc := v.AuxValAndOff()
  2314  	p := s.Prog(v.Op.Asm())
  2315  	p.From.Type = obj.TYPE_CONST
  2316  	p.From.Offset = sc.Val64()
  2317  	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
  2318  	ssagen.AddAux2(&m, v, sc.Off64())
  2319  	p.AddRestSource(m)
  2320  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2321  	p.To.Type = obj.TYPE_REG
  2322  	p.To.Reg = maskReg(v)
  2323  	return p
  2324  }
  2325  
  2326  // Example instruction: VCMPPS $81, -7(DI), Y16, K1, K3
  2327  func simdV2kkloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2328  	sc := v.AuxValAndOff()
  2329  	p := s.Prog(v.Op.Asm())
  2330  	p.From.Type = obj.TYPE_CONST
  2331  	p.From.Offset = sc.Val64()
  2332  	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
  2333  	ssagen.AddAux2(&m, v, sc.Off64())
  2334  	p.AddRestSource(m)
  2335  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2336  	p.AddRestSourceReg(maskReg(v.Args[2]))
  2337  	p.To.Type = obj.TYPE_REG
  2338  	p.To.Reg = maskReg(v)
  2339  	return p
  2340  }
  2341  
  2342  // Example instruction: VGF2P8AFFINEINVQB $64, -17(BP), X31, K3, X26
  2343  func simdV2kvloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2344  	sc := v.AuxValAndOff()
  2345  	p := s.Prog(v.Op.Asm())
  2346  	p.From.Type = obj.TYPE_CONST
  2347  	p.From.Offset = sc.Val64()
  2348  	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
  2349  	ssagen.AddAux2(&m, v, sc.Off64())
  2350  	p.AddRestSource(m)
  2351  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2352  	p.AddRestSourceReg(maskReg(v.Args[2]))
  2353  	p.To.Type = obj.TYPE_REG
  2354  	p.To.Reg = simdReg(v)
  2355  	return p
  2356  }
  2357  
  2358  // Example instruction: SHA1NEXTE X2, X2
  2359  func simdV21ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2360  	p := s.Prog(v.Op.Asm())
  2361  	p.From.Type = obj.TYPE_REG
  2362  	p.From.Reg = simdReg(v.Args[1])
  2363  	p.To.Type = obj.TYPE_REG
  2364  	p.To.Reg = simdReg(v)
  2365  	return p
  2366  }
  2367  
  2368  // Example instruction: SHA1RNDS4 $1, X2, X2
  2369  func simdV21ResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2370  	p := s.Prog(v.Op.Asm())
  2371  	p.From.Offset = int64(v.AuxUInt8())
  2372  	p.From.Type = obj.TYPE_CONST
  2373  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2374  	p.To.Type = obj.TYPE_REG
  2375  	p.To.Reg = simdReg(v)
  2376  	return p
  2377  }
  2378  
  2379  // Example instruction: SHA256RNDS2 X0, X11, X2
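        // The implicit X0 operand is pinned to Args[2] by regalloc (hence
        // "x0AtIn2"), so the operand layout is the same as simdV31ResultInArg0.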
  2380  func simdV31x0AtIn2ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2381  	return simdV31ResultInArg0(s, v)
  2382  }
  2383  
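        // blockJump maps each flag-based block kind to its conditional
        // jump and to the inverted jump used when the successors are
        // visited in the opposite order.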
  2384  var blockJump = [...]struct {
  2385  	asm, invasm obj.As
  2386  }{
  2387  	ssa.BlockAMD64EQ:  {x86.AJEQ, x86.AJNE},
  2388  	ssa.BlockAMD64NE:  {x86.AJNE, x86.AJEQ},
  2389  	ssa.BlockAMD64LT:  {x86.AJLT, x86.AJGE},
  2390  	ssa.BlockAMD64GE:  {x86.AJGE, x86.AJLT},
  2391  	ssa.BlockAMD64LE:  {x86.AJLE, x86.AJGT},
  2392  	ssa.BlockAMD64GT:  {x86.AJGT, x86.AJLE},
  2393  	ssa.BlockAMD64OS:  {x86.AJOS, x86.AJOC},
  2394  	ssa.BlockAMD64OC:  {x86.AJOC, x86.AJOS},
  2395  	ssa.BlockAMD64ULT: {x86.AJCS, x86.AJCC},
  2396  	ssa.BlockAMD64UGE: {x86.AJCC, x86.AJCS},
  2397  	ssa.BlockAMD64UGT: {x86.AJHI, x86.AJLS},
  2398  	ssa.BlockAMD64ULE: {x86.AJLS, x86.AJHI},
  2399  	ssa.BlockAMD64ORD: {x86.AJPC, x86.AJPS},
  2400  	ssa.BlockAMD64NAN: {x86.AJPS, x86.AJPC},
  2401  }
  2402  
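        // eqfJumps and nefJumps implement floating-point equality and
        // inequality branches. An unordered comparison (a NaN operand)
        // sets PF, so floating-point EQ is "ZF set and PF clear" and each
        // branch needs a pair of conditional jumps.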
  2403  var eqfJumps = [2][2]ssagen.IndexJump{
  2404  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}}, // next == b.Succs[0]
  2405  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}}, // next == b.Succs[1]
  2406  }
  2407  var nefJumps = [2][2]ssagen.IndexJump{
  2408  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}}, // next == b.Succs[0]
  2409  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}}, // next == b.Succs[1]
  2410  }
  2411  
  2412  func ssaGenBlock(s *ssagen.State, b, next *ssa.Block) {
  2413  	switch b.Kind {
  2414  	case ssa.BlockPlain, ssa.BlockDefer:
  2415  		if b.Succs[0].Block() != next {
  2416  			p := s.Prog(obj.AJMP)
  2417  			p.To.Type = obj.TYPE_BRANCH
  2418  			s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[0].Block()})
  2419  		}
  2420  	case ssa.BlockExit, ssa.BlockRetJmp:
  2421  	case ssa.BlockRet:
  2422  		s.Prog(obj.ARET)
  2423  
  2424  	case ssa.BlockAMD64EQF:
  2425  		s.CombJump(b, next, &eqfJumps)
  2426  
  2427  	case ssa.BlockAMD64NEF:
  2428  		s.CombJump(b, next, &nefJumps)
  2429  
  2430  	case ssa.BlockAMD64EQ, ssa.BlockAMD64NE,
  2431  		ssa.BlockAMD64LT, ssa.BlockAMD64GE,
  2432  		ssa.BlockAMD64LE, ssa.BlockAMD64GT,
  2433  		ssa.BlockAMD64OS, ssa.BlockAMD64OC,
  2434  		ssa.BlockAMD64ULT, ssa.BlockAMD64UGT,
  2435  		ssa.BlockAMD64ULE, ssa.BlockAMD64UGE:
  2436  		jmp := blockJump[b.Kind]
  2437  		switch next {
  2438  		case b.Succs[0].Block():
  2439  			s.Br(jmp.invasm, b.Succs[1].Block())
  2440  		case b.Succs[1].Block():
  2441  			s.Br(jmp.asm, b.Succs[0].Block())
  2442  		default:
  2443  			if b.Likely != ssa.BranchUnlikely {
  2444  				s.Br(jmp.asm, b.Succs[0].Block())
  2445  				s.Br(obj.AJMP, b.Succs[1].Block())
  2446  			} else {
  2447  				s.Br(jmp.invasm, b.Succs[1].Block())
  2448  				s.Br(obj.AJMP, b.Succs[0].Block())
  2449  			}
  2450  		}
  2451  
  2452  	case ssa.BlockAMD64JUMPTABLE:
  2453  		// JMP      *(TABLE)(INDEX*8)
  2454  		p := s.Prog(obj.AJMP)
  2455  		p.To.Type = obj.TYPE_MEM
  2456  		p.To.Reg = b.Controls[1].Reg()
  2457  		p.To.Index = b.Controls[0].Reg()
  2458  		p.To.Scale = 8
  2459  		// Save jump tables for later resolution of the target blocks.
  2460  		s.JumpTables = append(s.JumpTables, b)
  2461  
  2462  	default:
  2463  		b.Fatalf("branch not implemented: %s", b.LongString())
  2464  	}
  2465  }
  2466  
  2467  func loadRegResult(s *ssagen.State, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
  2468  	p := s.Prog(loadByRegWidth(reg, t.Size()))
  2469  	p.From.Type = obj.TYPE_MEM
  2470  	p.From.Name = obj.NAME_AUTO
  2471  	p.From.Sym = n.Linksym()
  2472  	p.From.Offset = n.FrameOffset() + off
  2473  	p.To.Type = obj.TYPE_REG
  2474  	p.To.Reg = reg
  2475  	return p
  2476  }
  2477  
  2478  func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
  2479  	p = pp.Append(p, storeByRegWidth(reg, t.Size()), obj.TYPE_REG, reg, 0, obj.TYPE_MEM, 0, n.FrameOffset()+off)
  2480  	p.To.Name = obj.NAME_PARAM
  2481  	p.To.Sym = n.Linksym()
  2482  	p.Pos = p.Pos.WithNotStmt()
  2483  	return p
  2484  }
  2485  
  2486  // zero 16 bytes at reg+off.
  2487  func zero16(s *ssagen.State, reg int16, off int64) {
  2488  	//   MOVUPS  X15, off(ptrReg)
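        	// X15 is the zero register under the Go amd64 ABI, so storing
        	// it writes 16 zero bytes.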
  2489  	p := s.Prog(x86.AMOVUPS)
  2490  	p.From.Type = obj.TYPE_REG
  2491  	p.From.Reg = x86.REG_X15
  2492  	p.To.Type = obj.TYPE_MEM
  2493  	p.To.Reg = reg
  2494  	p.To.Offset = off
  2495  }
  2496  
  2497  // move 16 bytes from src+off to dst+off using temporary register tmp.
  2498  func move16(s *ssagen.State, src, dst, tmp int16, off int64) {
  2499  	//   MOVUPS  off(srcReg), tmpReg
  2500  	//   MOVUPS  tmpReg, off(dstReg)
  2501  	p := s.Prog(x86.AMOVUPS)
  2502  	p.From.Type = obj.TYPE_MEM
  2503  	p.From.Reg = src
  2504  	p.From.Offset = off
  2505  	p.To.Type = obj.TYPE_REG
  2506  	p.To.Reg = tmp
  2507  	p = s.Prog(x86.AMOVUPS)
  2508  	p.From.Type = obj.TYPE_REG
  2509  	p.From.Reg = tmp
  2510  	p.To.Type = obj.TYPE_MEM
  2511  	p.To.Reg = dst
  2512  	p.To.Offset = off
  2513  }
  2514  
  2515  // XXX maybe make this part of v.Reg?
  2516  // On the other hand, it is architecture-specific.
  2517  func simdReg(v *ssa.Value) int16 {
  2518  	t := v.Type
  2519  	if !t.IsSIMD() {
  2520  		base.Fatalf("simdReg: not a simd type; v=%s, b=b%d, f=%s", v.LongString(), v.Block.ID, v.Block.Func.Name)
  2521  	}
  2522  	return simdRegBySize(v.Reg(), t.Size())
  2523  }
  2524  
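        // simdRegBySize widens an X register assigned by regalloc to the Y
        // or Z alias of the same physical register, according to the
        // value's size in bytes.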
  2525  func simdRegBySize(reg int16, size int64) int16 {
  2526  	switch size {
  2527  	case 16:
  2528  		return reg
  2529  	case 32:
  2530  		return reg + (x86.REG_Y0 - x86.REG_X0)
  2531  	case 64:
  2532  		return reg + (x86.REG_Z0 - x86.REG_X0)
  2533  	}
  2534  	panic("simdRegBySize: bad size")
  2535  }
  2536  
  2537  // maskReg returns the K mask register assigned to v.
  2538  func maskReg(v *ssa.Value) int16 {
  2539  	t := v.Type
  2540  	if !t.IsSIMD() {
  2541  		base.Fatalf("maskReg: not a simd type; v=%s, b=b%d, f=%s", v.LongString(), v.Block.ID, v.Block.Func.Name)
  2542  	}
  2543  	switch t.Size() {
  2544  	case 8:
  2545  		return v.Reg()
  2546  	}
  2547  	panic("unreachable")
  2548  }
  2549  
  2550  // simdOrMaskReg returns the K mask register for mask values (size <= 8) and the vector register otherwise.
  2551  func simdOrMaskReg(v *ssa.Value) int16 {
  2552  	t := v.Type
  2553  	if t.Size() <= 8 {
  2554  		return maskReg(v)
  2555  	}
  2556  	return simdReg(v)
  2557  }
  2558  
  2559  // simdCheckRegOnly is used for shift operations only.
  2560  // regalloc may issue an OpCopy with an incorrect type, but the assigned
  2561  // register should still be correct; this function merely sanity-checks
  2562  // that the register falls within the expected range.
  2563  func simdCheckRegOnly(v *ssa.Value, regStart, regEnd int16) int16 {
  2564  	if v.Reg() > regEnd || v.Reg() < regStart {
  2565  		panic("simdCheckRegOnly: not the desired register")
  2566  	}
  2567  	return v.Reg()
  2568  }
  2569  