Source file src/cmd/compile/internal/amd64/ssa.go

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package amd64
     6  
     7  import (
     8  	"fmt"
     9  	"math"
    10  
    11  	"cmd/compile/internal/base"
    12  	"cmd/compile/internal/ir"
    13  	"cmd/compile/internal/logopt"
    14  	"cmd/compile/internal/objw"
    15  	"cmd/compile/internal/ssa"
    16  	"cmd/compile/internal/ssagen"
    17  	"cmd/compile/internal/types"
    18  	"cmd/internal/obj"
    19  	"cmd/internal/obj/x86"
    20  	"internal/abi"
    21  )
    22  
    23  // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
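        // If flags are live across such a constant, the usual XOR reg,reg
        // materialization of zero would clobber them; the mark (a non-nil Aux)
        // tells ssaGenValue to emit a real MOV instead.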
    24  func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {
    25  	flive := b.FlagsLiveAtEnd
    26  	for _, c := range b.ControlValues() {
    27  		flive = c.Type.IsFlags() || flive
    28  	}
    29  	for i := len(b.Values) - 1; i >= 0; i-- {
    30  		v := b.Values[i]
    31  		if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) {
    32  			// The "mark" is any non-nil Aux value.
    33  			v.Aux = ssa.AuxMark
    34  		}
    35  		if v.Type.IsFlags() {
    36  			flive = false
    37  		}
    38  		for _, a := range v.Args {
    39  			if a.Type.IsFlags() {
    40  				flive = true
    41  			}
    42  		}
    43  	}
    44  }
    45  
    46  // loadByType returns the load instruction of the given type.
    47  func loadByType(t *types.Type) obj.As {
    48  	// Avoid partial register write
    49  	if !t.IsFloat() {
    50  		switch t.Size() {
    51  		case 1:
    52  			return x86.AMOVBLZX
    53  		case 2:
    54  			return x86.AMOVWLZX
    55  		}
    56  	}
    57  	// Otherwise, there's no difference between load and store opcodes.
    58  	return storeByType(t)
    59  }
    60  
    61  // storeByType returns the store instruction of the given type.
    62  func storeByType(t *types.Type) obj.As {
    63  	width := t.Size()
    64  	if t.IsFloat() {
    65  		switch width {
    66  		case 4:
    67  			return x86.AMOVSS
    68  		case 8:
    69  			return x86.AMOVSD
    70  		}
    71  	} else {
    72  		switch width {
    73  		case 1:
    74  			return x86.AMOVB
    75  		case 2:
    76  			return x86.AMOVW
    77  		case 4:
    78  			return x86.AMOVL
    79  		case 8:
    80  			return x86.AMOVQ
    81  		case 16:
    82  			return x86.AMOVUPS
    83  		}
    84  	}
    85  	panic(fmt.Sprintf("bad store type %v", t))
    86  }
    87  
    88  // moveByType returns the reg->reg move instruction of the given type.
    89  func moveByType(t *types.Type) obj.As {
    90  	if t.IsFloat() {
    91  		// Moving the whole sse2 register is faster
    92  		// than moving just the correct low portion of it.
    93  		// There is no xmm->xmm move with 1 byte opcode,
    94  		// so use movups, which has 2 byte opcode.
    95  		return x86.AMOVUPS
    96  	} else {
    97  		switch t.Size() {
    98  		case 1:
    99  			// Avoids partial register write
   100  			return x86.AMOVL
   101  		case 2:
   102  			return x86.AMOVL
   103  		case 4:
   104  			return x86.AMOVL
   105  		case 8:
   106  			return x86.AMOVQ
   107  		case 16:
   108  			return x86.AMOVUPS // int128s are in SSE registers
   109  		default:
   110  			panic(fmt.Sprintf("bad int register width %d:%v", t.Size(), t))
   111  		}
   112  	}
   113  }
   114  
   115  // opregreg emits instructions for
   116  //
   117  //	dest := dest(To) op src(From)
   118  //
   119  // and also returns the created obj.Prog so it
   120  // may be further adjusted (offset, scale, etc).
   121  func opregreg(s *ssagen.State, op obj.As, dest, src int16) *obj.Prog {
   122  	p := s.Prog(op)
   123  	p.From.Type = obj.TYPE_REG
   124  	p.To.Type = obj.TYPE_REG
   125  	p.To.Reg = dest
   126  	p.From.Reg = src
   127  	return p
   128  }
   129  
   130  // memIdx fills out a as an indexed memory reference for v.
   131  // It assumes that the base register and the index register
   132  // are v.Args[0].Reg() and v.Args[1].Reg(), respectively.
   133  // The caller must still use gc.AddAux/gc.AddAux2 to handle v.Aux as necessary.
   134  func memIdx(a *obj.Addr, v *ssa.Value) {
   135  	r, i := v.Args[0].Reg(), v.Args[1].Reg()
   136  	a.Type = obj.TYPE_MEM
   137  	a.Scale = v.Op.Scale()
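        	// SP cannot be used as an index register. With scale 1 the base and
        	// index are interchangeable, so swap them if the index is SP.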
   138  	if a.Scale == 1 && i == x86.REG_SP {
   139  		r, i = i, r
   140  	}
   141  	a.Reg = r
   142  	a.Index = i
   143  }
   144  
   145  // DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ;
   146  // see runtime/mkduff.go.
   147  func duffStart(size int64) int64 {
   148  	x, _ := duff(size)
   149  	return x
   150  }
   151  func duffAdj(size int64) int64 {
   152  	_, x := duff(size)
   153  	return x
   154  }
   155  
   156  // duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
   157  // required to use the duffzero mechanism for a block of the given size.
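        // The adjust is negative when size is not a multiple of a full block:
        // in that case we jump into the middle of a block, and the MOVUPS
        // instructions there use offsets that assume DI starts
        // dzClearStep*(dzBlockLen-steps) bytes before the region to clear,
        // so DI must first be backed up by that amount.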
   158  func duff(size int64) (int64, int64) {
   159  	if size < 32 || size > 1024 || size%dzClearStep != 0 {
   160  		panic("bad duffzero size")
   161  	}
   162  	steps := size / dzClearStep
   163  	blocks := steps / dzBlockLen
   164  	steps %= dzBlockLen
   165  	off := dzBlockSize * (dzBlocks - blocks)
   166  	var adj int64
   167  	if steps != 0 {
   168  		off -= dzLeaqSize
   169  		off -= dzMovSize * steps
   170  		adj -= dzClearStep * (dzBlockLen - steps)
   171  	}
   172  	return off, adj
   173  }
   174  
   175  func getgFromTLS(s *ssagen.State, r int16) {
   176  	// See the comments in cmd/internal/obj/x86/obj6.go
   177  	// near CanUse1InsnTLS for a detailed explanation of these instructions.
   178  	if x86.CanUse1InsnTLS(base.Ctxt) {
   179  		// MOVQ (TLS), r
   180  		p := s.Prog(x86.AMOVQ)
   181  		p.From.Type = obj.TYPE_MEM
   182  		p.From.Reg = x86.REG_TLS
   183  		p.To.Type = obj.TYPE_REG
   184  		p.To.Reg = r
   185  	} else {
   186  		// MOVQ TLS, r
   187  		// MOVQ (r)(TLS*1), r
   188  		p := s.Prog(x86.AMOVQ)
   189  		p.From.Type = obj.TYPE_REG
   190  		p.From.Reg = x86.REG_TLS
   191  		p.To.Type = obj.TYPE_REG
   192  		p.To.Reg = r
   193  		q := s.Prog(x86.AMOVQ)
   194  		q.From.Type = obj.TYPE_MEM
   195  		q.From.Reg = r
   196  		q.From.Index = x86.REG_TLS
   197  		q.From.Scale = 1
   198  		q.To.Type = obj.TYPE_REG
   199  		q.To.Reg = r
   200  	}
   201  }
   202  
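        // ssaGenValue emits the machine instructions for a single SSA value v,
        // dispatching on v.Op.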
   203  func ssaGenValue(s *ssagen.State, v *ssa.Value) {
   204  	switch v.Op {
   205  	case ssa.OpAMD64VFMADD231SD, ssa.OpAMD64VFMADD231SS:
   206  		p := s.Prog(v.Op.Asm())
   207  		p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[2].Reg()}
   208  		p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
   209  		p.AddRestSourceReg(v.Args[1].Reg())
   210  	case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
   211  		r := v.Reg()
   212  		r1 := v.Args[0].Reg()
   213  		r2 := v.Args[1].Reg()
   214  		switch {
   215  		case r == r1:
   216  			p := s.Prog(v.Op.Asm())
   217  			p.From.Type = obj.TYPE_REG
   218  			p.From.Reg = r2
   219  			p.To.Type = obj.TYPE_REG
   220  			p.To.Reg = r
   221  		case r == r2:
   222  			p := s.Prog(v.Op.Asm())
   223  			p.From.Type = obj.TYPE_REG
   224  			p.From.Reg = r1
   225  			p.To.Type = obj.TYPE_REG
   226  			p.To.Reg = r
   227  		default:
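        			// The output register matches neither input; use LEA so the
        			// sum can be written to a third register without clobbering
        			// either argument (and without needing a MOV first).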
   228  			var asm obj.As
   229  			if v.Op == ssa.OpAMD64ADDQ {
   230  				asm = x86.ALEAQ
   231  			} else {
   232  				asm = x86.ALEAL
   233  			}
   234  			p := s.Prog(asm)
   235  			p.From.Type = obj.TYPE_MEM
   236  			p.From.Reg = r1
   237  			p.From.Scale = 1
   238  			p.From.Index = r2
   239  			p.To.Type = obj.TYPE_REG
   240  			p.To.Reg = r
   241  		}
   242  	// 2-address opcode arithmetic
   243  	case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL,
   244  		ssa.OpAMD64MULQ, ssa.OpAMD64MULL,
   245  		ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL,
   246  		ssa.OpAMD64ORQ, ssa.OpAMD64ORL,
   247  		ssa.OpAMD64XORQ, ssa.OpAMD64XORL,
   248  		ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL,
   249  		ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
   250  		ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB,
   251  		ssa.OpAMD64ROLQ, ssa.OpAMD64ROLL, ssa.OpAMD64ROLW, ssa.OpAMD64ROLB,
   252  		ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB,
   253  		ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
   254  		ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
   255  		ssa.OpAMD64MINSS, ssa.OpAMD64MINSD,
   256  		ssa.OpAMD64POR, ssa.OpAMD64PXOR,
   257  		ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ,
   258  		ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ,
   259  		ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ,
   260  		ssa.OpAMD64PCMPEQB, ssa.OpAMD64PSIGNB,
   261  		ssa.OpAMD64PUNPCKLBW:
   262  		opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   263  
   264  	case ssa.OpAMD64PSHUFLW:
   265  		p := s.Prog(v.Op.Asm())
   266  		imm := v.AuxInt
   267  		if imm < 0 || imm > 255 {
   268  			v.Fatalf("Invalid source selection immediate")
   269  		}
   270  		p.From.Offset = imm
   271  		p.From.Type = obj.TYPE_CONST
   272  		p.AddRestSourceReg(v.Args[0].Reg())
   273  		p.To.Type = obj.TYPE_REG
   274  		p.To.Reg = v.Reg()
   275  
   276  	case ssa.OpAMD64PSHUFBbroadcast:
   277  		// PSHUFB with a control mask of zero copies byte 0 to all
   278  		// bytes in the register.
   279  		//
   280  		// X15 is always zero with ABIInternal.
   281  		if s.ABI != obj.ABIInternal {
   282  			// zero X15 manually
   283  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
   284  		}
   285  
   286  		p := s.Prog(v.Op.Asm())
   287  		p.From.Type = obj.TYPE_REG
   288  		p.To.Type = obj.TYPE_REG
   289  		p.To.Reg = v.Reg()
   290  		p.From.Reg = x86.REG_X15
   291  
   292  	case ssa.OpAMD64SHRDQ, ssa.OpAMD64SHLDQ:
   293  		p := s.Prog(v.Op.Asm())
   294  		lo, hi, bits := v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg()
   295  		p.From.Type = obj.TYPE_REG
   296  		p.From.Reg = bits
   297  		p.To.Type = obj.TYPE_REG
   298  		p.To.Reg = lo
   299  		p.AddRestSourceReg(hi)
   300  
   301  	case ssa.OpAMD64BLSIQ, ssa.OpAMD64BLSIL,
   302  		ssa.OpAMD64BLSMSKQ, ssa.OpAMD64BLSMSKL,
   303  		ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
   304  		p := s.Prog(v.Op.Asm())
   305  		p.From.Type = obj.TYPE_REG
   306  		p.From.Reg = v.Args[0].Reg()
   307  		p.To.Type = obj.TYPE_REG
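        		// BLSR also produces a second (flags) result, so its integer
        		// output is Reg0; BLSI and BLSMSK have a single register output.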
   308  		switch v.Op {
   309  		case ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
   310  			p.To.Reg = v.Reg0()
   311  		default:
   312  			p.To.Reg = v.Reg()
   313  		}
   314  
   315  	case ssa.OpAMD64ANDNQ, ssa.OpAMD64ANDNL:
   316  		p := s.Prog(v.Op.Asm())
   317  		p.From.Type = obj.TYPE_REG
   318  		p.From.Reg = v.Args[0].Reg()
   319  		p.To.Type = obj.TYPE_REG
   320  		p.To.Reg = v.Reg()
   321  		p.AddRestSourceReg(v.Args[1].Reg())
   322  
   323  	case ssa.OpAMD64SARXL, ssa.OpAMD64SARXQ,
   324  		ssa.OpAMD64SHLXL, ssa.OpAMD64SHLXQ,
   325  		ssa.OpAMD64SHRXL, ssa.OpAMD64SHRXQ:
   326  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   327  		p.AddRestSourceReg(v.Args[0].Reg())
   328  
   329  	case ssa.OpAMD64SHLXLload, ssa.OpAMD64SHLXQload,
   330  		ssa.OpAMD64SHRXLload, ssa.OpAMD64SHRXQload,
   331  		ssa.OpAMD64SARXLload, ssa.OpAMD64SARXQload:
   332  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   333  		m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
   334  		ssagen.AddAux(&m, v)
   335  		p.AddRestSource(m)
   336  
   337  	case ssa.OpAMD64SHLXLloadidx1, ssa.OpAMD64SHLXLloadidx4, ssa.OpAMD64SHLXLloadidx8,
   338  		ssa.OpAMD64SHRXLloadidx1, ssa.OpAMD64SHRXLloadidx4, ssa.OpAMD64SHRXLloadidx8,
   339  		ssa.OpAMD64SARXLloadidx1, ssa.OpAMD64SARXLloadidx4, ssa.OpAMD64SARXLloadidx8,
   340  		ssa.OpAMD64SHLXQloadidx1, ssa.OpAMD64SHLXQloadidx8,
   341  		ssa.OpAMD64SHRXQloadidx1, ssa.OpAMD64SHRXQloadidx8,
   342  		ssa.OpAMD64SARXQloadidx1, ssa.OpAMD64SARXQloadidx8:
   343  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[2].Reg())
   344  		m := obj.Addr{Type: obj.TYPE_MEM}
   345  		memIdx(&m, v)
   346  		ssagen.AddAux(&m, v)
   347  		p.AddRestSource(m)
   348  
   349  	case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
   350  		// Arg[0] (the dividend) is in AX.
   351  		// Arg[1] (the divisor) can be in any other register.
   352  		// Result[0] (the quotient) is in AX.
   353  		// Result[1] (the remainder) is in DX.
   354  		r := v.Args[1].Reg()
   355  
   356  		// Zero extend dividend.
   357  		opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX)
   358  
   359  		// Issue divide.
   360  		p := s.Prog(v.Op.Asm())
   361  		p.From.Type = obj.TYPE_REG
   362  		p.From.Reg = r
   363  
   364  	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
   365  		// Arg[0] (the dividend) is in AX.
   366  		// Arg[1] (the divisor) can be in any other register.
   367  		// Result[0] (the quotient) is in AX.
   368  		// Result[1] (the remainder) is in DX.
   369  		r := v.Args[1].Reg()
   370  
   371  		var opCMP, opNEG, opSXD obj.As
   372  		switch v.Op {
   373  		case ssa.OpAMD64DIVQ:
   374  			opCMP, opNEG, opSXD = x86.ACMPQ, x86.ANEGQ, x86.ACQO
   375  		case ssa.OpAMD64DIVL:
   376  			opCMP, opNEG, opSXD = x86.ACMPL, x86.ANEGL, x86.ACDQ
   377  		case ssa.OpAMD64DIVW:
   378  			opCMP, opNEG, opSXD = x86.ACMPW, x86.ANEGW, x86.ACWD
   379  		}
   380  
   381  		// CPU faults upon signed overflow, which occurs when the most
   382  		// negative int is divided by -1. Handle divide by -1 as a special case.
   383  		var j1, j2 *obj.Prog
   384  		if ssa.DivisionNeedsFixUp(v) {
   385  			c := s.Prog(opCMP)
   386  			c.From.Type = obj.TYPE_REG
   387  			c.From.Reg = r
   388  			c.To.Type = obj.TYPE_CONST
   389  			c.To.Offset = -1
   390  
   391  			// Divisor is not -1, proceed with normal division.
   392  			j1 = s.Prog(x86.AJNE)
   393  			j1.To.Type = obj.TYPE_BRANCH
   394  
   395  			// Divisor is -1, manually compute quotient and remainder via fixup code.
   396  			// n / -1 = -n
   397  			n1 := s.Prog(opNEG)
   398  			n1.To.Type = obj.TYPE_REG
   399  			n1.To.Reg = x86.REG_AX
   400  
   401  			// n % -1 == 0
   402  			opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX)
   403  
   404  			// TODO(khr): issue only the -1 fixup code we need.
   405  			// For instance, if only the quotient is used, no point in zeroing the remainder.
   406  
   407  			// Skip over normal division.
   408  			j2 = s.Prog(obj.AJMP)
   409  			j2.To.Type = obj.TYPE_BRANCH
   410  		}
   411  
   412  		// Sign extend dividend and perform division.
   413  		p := s.Prog(opSXD)
   414  		if j1 != nil {
   415  			j1.To.SetTarget(p)
   416  		}
   417  		p = s.Prog(v.Op.Asm())
   418  		p.From.Type = obj.TYPE_REG
   419  		p.From.Reg = r
   420  
   421  		if j2 != nil {
   422  			j2.To.SetTarget(s.Pc())
   423  		}
   424  
   425  	case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU:
   426  		// The frontend rewrites constant division by 8/16/32-bit integers into
   427  		// HMUL by a constant.
   428  		// SSA rewrites generate the 64-bit versions.
   429  
   430  		// Arg[0] is already in AX as it's the only register we allow
   431  		// and DX is the only output we care about (the high bits)
   432  		p := s.Prog(v.Op.Asm())
   433  		p.From.Type = obj.TYPE_REG
   434  		p.From.Reg = v.Args[1].Reg()
   435  
   436  		// IMULB puts the high portion in AH instead of DL,
   437  		// so move it to DL for consistency
   438  		if v.Type.Size() == 1 {
   439  			m := s.Prog(x86.AMOVB)
   440  			m.From.Type = obj.TYPE_REG
   441  			m.From.Reg = x86.REG_AH
   442  			m.To.Type = obj.TYPE_REG
   443  			m.To.Reg = x86.REG_DX
   444  		}
   445  
   446  	case ssa.OpAMD64MULQU, ssa.OpAMD64MULLU:
   447  		// Arg[0] is already in AX as it's the only register we allow
   448  		// results lo in AX
   449  		p := s.Prog(v.Op.Asm())
   450  		p.From.Type = obj.TYPE_REG
   451  		p.From.Reg = v.Args[1].Reg()
   452  
   453  	case ssa.OpAMD64MULQU2:
   454  		// Arg[0] is already in AX as it's the only register we allow
   455  		// results hi in DX, lo in AX
   456  		p := s.Prog(v.Op.Asm())
   457  		p.From.Type = obj.TYPE_REG
   458  		p.From.Reg = v.Args[1].Reg()
   459  
   460  	case ssa.OpAMD64DIVQU2:
   461  		// Arg[0], Arg[1] are already in DX, AX, as they're the only registers we allow
   462  		// results q in AX, r in DX
   463  		p := s.Prog(v.Op.Asm())
   464  		p.From.Type = obj.TYPE_REG
   465  		p.From.Reg = v.Args[2].Reg()
   466  
   467  	case ssa.OpAMD64AVGQU:
   468  		// compute (x+y)/2 unsigned.
   469  		// Do a 64-bit add, the overflow goes into the carry.
   470  		// Shift right once and pull the carry back into the 63rd bit.
   471  		p := s.Prog(x86.AADDQ)
   472  		p.From.Type = obj.TYPE_REG
   473  		p.To.Type = obj.TYPE_REG
   474  		p.To.Reg = v.Reg()
   475  		p.From.Reg = v.Args[1].Reg()
   476  		p = s.Prog(x86.ARCRQ)
   477  		p.From.Type = obj.TYPE_CONST
   478  		p.From.Offset = 1
   479  		p.To.Type = obj.TYPE_REG
   480  		p.To.Reg = v.Reg()
   481  
   482  	case ssa.OpAMD64ADDQcarry, ssa.OpAMD64ADCQ:
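        		// ADD/ADC are two-operand instructions, so regalloc must have
        		// placed the output in one of the input registers; anything else
        		// is a compiler bug.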
   483  		r := v.Reg0()
   484  		r0 := v.Args[0].Reg()
   485  		r1 := v.Args[1].Reg()
   486  		switch r {
   487  		case r0:
   488  			p := s.Prog(v.Op.Asm())
   489  			p.From.Type = obj.TYPE_REG
   490  			p.From.Reg = r1
   491  			p.To.Type = obj.TYPE_REG
   492  			p.To.Reg = r
   493  		case r1:
   494  			p := s.Prog(v.Op.Asm())
   495  			p.From.Type = obj.TYPE_REG
   496  			p.From.Reg = r0
   497  			p.To.Type = obj.TYPE_REG
   498  			p.To.Reg = r
   499  		default:
   500  			v.Fatalf("output not in same register as an input %s", v.LongString())
   501  		}
   502  
   503  	case ssa.OpAMD64SUBQborrow, ssa.OpAMD64SBBQ:
   504  		p := s.Prog(v.Op.Asm())
   505  		p.From.Type = obj.TYPE_REG
   506  		p.From.Reg = v.Args[1].Reg()
   507  		p.To.Type = obj.TYPE_REG
   508  		p.To.Reg = v.Reg0()
   509  
   510  	case ssa.OpAMD64ADDQconstcarry, ssa.OpAMD64ADCQconst, ssa.OpAMD64SUBQconstborrow, ssa.OpAMD64SBBQconst:
   511  		p := s.Prog(v.Op.Asm())
   512  		p.From.Type = obj.TYPE_CONST
   513  		p.From.Offset = v.AuxInt
   514  		p.To.Type = obj.TYPE_REG
   515  		p.To.Reg = v.Reg0()
   516  
   517  	case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
   518  		r := v.Reg()
   519  		a := v.Args[0].Reg()
   520  		if r == a {
   521  			switch v.AuxInt {
   522  			case 1:
   523  				var asm obj.As
   524  				// Software optimization manual recommends add $1,reg.
   525  				// But inc/dec is 1 byte smaller. ICC always uses inc.
   526  				// Clang/GCC choose depending on flags, but prefer add.
   527  				// Experiments show that inc/dec is both a little faster
   528  				// and makes the binary a little smaller.
   529  				if v.Op == ssa.OpAMD64ADDQconst {
   530  					asm = x86.AINCQ
   531  				} else {
   532  					asm = x86.AINCL
   533  				}
   534  				p := s.Prog(asm)
   535  				p.To.Type = obj.TYPE_REG
   536  				p.To.Reg = r
   537  				return
   538  			case -1:
   539  				var asm obj.As
   540  				if v.Op == ssa.OpAMD64ADDQconst {
   541  					asm = x86.ADECQ
   542  				} else {
   543  					asm = x86.ADECL
   544  				}
   545  				p := s.Prog(asm)
   546  				p.To.Type = obj.TYPE_REG
   547  				p.To.Reg = r
   548  				return
   549  			case 0x80:
   550  				// 'SUBQ $-0x80, r' is shorter to encode than
   551  				// and functionally equivalent to 'ADDQ $0x80, r'.
   552  				asm := x86.ASUBL
   553  				if v.Op == ssa.OpAMD64ADDQconst {
   554  					asm = x86.ASUBQ
   555  				}
   556  				p := s.Prog(asm)
   557  				p.From.Type = obj.TYPE_CONST
   558  				p.From.Offset = -0x80
   559  				p.To.Type = obj.TYPE_REG
   560  				p.To.Reg = r
   561  				return
   562  
   563  			}
   564  			p := s.Prog(v.Op.Asm())
   565  			p.From.Type = obj.TYPE_CONST
   566  			p.From.Offset = v.AuxInt
   567  			p.To.Type = obj.TYPE_REG
   568  			p.To.Reg = r
   569  			return
   570  		}
   571  		var asm obj.As
   572  		if v.Op == ssa.OpAMD64ADDQconst {
   573  			asm = x86.ALEAQ
   574  		} else {
   575  			asm = x86.ALEAL
   576  		}
   577  		p := s.Prog(asm)
   578  		p.From.Type = obj.TYPE_MEM
   579  		p.From.Reg = a
   580  		p.From.Offset = v.AuxInt
   581  		p.To.Type = obj.TYPE_REG
   582  		p.To.Reg = r
   583  
   584  	case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ, ssa.OpAMD64CMOVWEQ,
   585  		ssa.OpAMD64CMOVQLT, ssa.OpAMD64CMOVLLT, ssa.OpAMD64CMOVWLT,
   586  		ssa.OpAMD64CMOVQNE, ssa.OpAMD64CMOVLNE, ssa.OpAMD64CMOVWNE,
   587  		ssa.OpAMD64CMOVQGT, ssa.OpAMD64CMOVLGT, ssa.OpAMD64CMOVWGT,
   588  		ssa.OpAMD64CMOVQLE, ssa.OpAMD64CMOVLLE, ssa.OpAMD64CMOVWLE,
   589  		ssa.OpAMD64CMOVQGE, ssa.OpAMD64CMOVLGE, ssa.OpAMD64CMOVWGE,
   590  		ssa.OpAMD64CMOVQHI, ssa.OpAMD64CMOVLHI, ssa.OpAMD64CMOVWHI,
   591  		ssa.OpAMD64CMOVQLS, ssa.OpAMD64CMOVLLS, ssa.OpAMD64CMOVWLS,
   592  		ssa.OpAMD64CMOVQCC, ssa.OpAMD64CMOVLCC, ssa.OpAMD64CMOVWCC,
   593  		ssa.OpAMD64CMOVQCS, ssa.OpAMD64CMOVLCS, ssa.OpAMD64CMOVWCS,
   594  		ssa.OpAMD64CMOVQGTF, ssa.OpAMD64CMOVLGTF, ssa.OpAMD64CMOVWGTF,
   595  		ssa.OpAMD64CMOVQGEF, ssa.OpAMD64CMOVLGEF, ssa.OpAMD64CMOVWGEF:
   596  		p := s.Prog(v.Op.Asm())
   597  		p.From.Type = obj.TYPE_REG
   598  		p.From.Reg = v.Args[1].Reg()
   599  		p.To.Type = obj.TYPE_REG
   600  		p.To.Reg = v.Reg()
   601  
   602  	case ssa.OpAMD64CMOVQNEF, ssa.OpAMD64CMOVLNEF, ssa.OpAMD64CMOVWNEF:
   603  		// Flag condition: ^ZERO || PARITY
   604  		// Generate:
   605  		//   CMOV*NE  SRC,DST
   606  		//   CMOV*PS  SRC,DST
   607  		p := s.Prog(v.Op.Asm())
   608  		p.From.Type = obj.TYPE_REG
   609  		p.From.Reg = v.Args[1].Reg()
   610  		p.To.Type = obj.TYPE_REG
   611  		p.To.Reg = v.Reg()
   612  		var q *obj.Prog
   613  		if v.Op == ssa.OpAMD64CMOVQNEF {
   614  			q = s.Prog(x86.ACMOVQPS)
   615  		} else if v.Op == ssa.OpAMD64CMOVLNEF {
   616  			q = s.Prog(x86.ACMOVLPS)
   617  		} else {
   618  			q = s.Prog(x86.ACMOVWPS)
   619  		}
   620  		q.From.Type = obj.TYPE_REG
   621  		q.From.Reg = v.Args[1].Reg()
   622  		q.To.Type = obj.TYPE_REG
   623  		q.To.Reg = v.Reg()
   624  
   625  	case ssa.OpAMD64CMOVQEQF, ssa.OpAMD64CMOVLEQF, ssa.OpAMD64CMOVWEQF:
   626  		// Flag condition: ZERO && !PARITY
   627  		// Generate:
   628  		//   MOV      SRC,TMP
   629  		//   CMOV*NE  DST,TMP
   630  		//   CMOV*PC  TMP,DST
   631  		//
   632  		// TODO(rasky): we could generate:
   633  		//   CMOV*NE  DST,SRC
   634  		//   CMOV*PC  SRC,DST
   635  		// But this requires a way for regalloc to know that SRC might be
   636  		// clobbered by this instruction.
   637  		t := v.RegTmp()
   638  		opregreg(s, moveByType(v.Type), t, v.Args[1].Reg())
   639  
   640  		p := s.Prog(v.Op.Asm())
   641  		p.From.Type = obj.TYPE_REG
   642  		p.From.Reg = v.Reg()
   643  		p.To.Type = obj.TYPE_REG
   644  		p.To.Reg = t
   645  		var q *obj.Prog
   646  		if v.Op == ssa.OpAMD64CMOVQEQF {
   647  			q = s.Prog(x86.ACMOVQPC)
   648  		} else if v.Op == ssa.OpAMD64CMOVLEQF {
   649  			q = s.Prog(x86.ACMOVLPC)
   650  		} else {
   651  			q = s.Prog(x86.ACMOVWPC)
   652  		}
   653  		q.From.Type = obj.TYPE_REG
   654  		q.From.Reg = t
   655  		q.To.Type = obj.TYPE_REG
   656  		q.To.Reg = v.Reg()
   657  
   658  	case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
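        		// IMUL with an immediate is a three-operand instruction
        		// (dst = arg0 * AuxInt), so the destination need not match the source.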
   659  		r := v.Reg()
   660  		p := s.Prog(v.Op.Asm())
   661  		p.From.Type = obj.TYPE_CONST
   662  		p.From.Offset = v.AuxInt
   663  		p.To.Type = obj.TYPE_REG
   664  		p.To.Reg = r
   665  		p.AddRestSourceReg(v.Args[0].Reg())
   666  
   667  	case ssa.OpAMD64ANDQconst:
   668  		asm := v.Op.Asm()
   669  		// If the constant is positive and fits into 32 bits, use ANDL.
   670  		// This saves a few bytes of encoding.
   671  		if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
   672  			asm = x86.AANDL
   673  		}
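        		// This is safe because ANDL zeroes the upper 32 bits of the
        		// destination, which matches what ANDQ computes when the
        		// constant's upper 32 bits are zero.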
   674  		p := s.Prog(asm)
   675  		p.From.Type = obj.TYPE_CONST
   676  		p.From.Offset = v.AuxInt
   677  		p.To.Type = obj.TYPE_REG
   678  		p.To.Reg = v.Reg()
   679  
   680  	case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst,
   681  		ssa.OpAMD64ANDLconst,
   682  		ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst,
   683  		ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst,
   684  		ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst,
   685  		ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
   686  		ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
   687  		ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
   688  		p := s.Prog(v.Op.Asm())
   689  		p.From.Type = obj.TYPE_CONST
   690  		p.From.Offset = v.AuxInt
   691  		p.To.Type = obj.TYPE_REG
   692  		p.To.Reg = v.Reg()
   693  	case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask:
   694  		r := v.Reg()
   695  		p := s.Prog(v.Op.Asm())
   696  		p.From.Type = obj.TYPE_REG
   697  		p.From.Reg = r
   698  		p.To.Type = obj.TYPE_REG
   699  		p.To.Reg = r
   700  	case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8,
   701  		ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8,
   702  		ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
   703  		p := s.Prog(v.Op.Asm())
   704  		memIdx(&p.From, v)
   705  		o := v.Reg()
   706  		p.To.Type = obj.TYPE_REG
   707  		p.To.Reg = o
   708  		if v.AuxInt != 0 && v.Aux == nil {
   709  			// Emit an additional LEA to add the displacement instead of creating a slow 3 operand LEA.
   710  			switch v.Op {
   711  			case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8:
   712  				p = s.Prog(x86.ALEAQ)
   713  			case ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8:
   714  				p = s.Prog(x86.ALEAL)
   715  			case ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
   716  				p = s.Prog(x86.ALEAW)
   717  			}
   718  			p.From.Type = obj.TYPE_MEM
   719  			p.From.Reg = o
   720  			p.To.Type = obj.TYPE_REG
   721  			p.To.Reg = o
   722  		}
   723  		ssagen.AddAux(&p.From, v)
   724  	case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL, ssa.OpAMD64LEAW:
   725  		p := s.Prog(v.Op.Asm())
   726  		p.From.Type = obj.TYPE_MEM
   727  		p.From.Reg = v.Args[0].Reg()
   728  		ssagen.AddAux(&p.From, v)
   729  		p.To.Type = obj.TYPE_REG
   730  		p.To.Reg = v.Reg()
   731  	case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
   732  		ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB,
   733  		ssa.OpAMD64BTL, ssa.OpAMD64BTQ:
   734  		opregreg(s, v.Op.Asm(), v.Args[1].Reg(), v.Args[0].Reg())
   735  	case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
   736  		// The Go assembler has swapped operands for UCOMISx relative to CMP;
   737  		// we must account for that here.
   738  		opregreg(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg())
   739  	case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
   740  		p := s.Prog(v.Op.Asm())
   741  		p.From.Type = obj.TYPE_REG
   742  		p.From.Reg = v.Args[0].Reg()
   743  		p.To.Type = obj.TYPE_CONST
   744  		p.To.Offset = v.AuxInt
   745  	case ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst,
   746  		ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst,
   747  		ssa.OpAMD64BTSQconst,
   748  		ssa.OpAMD64BTCQconst,
   749  		ssa.OpAMD64BTRQconst:
   750  		op := v.Op
   751  		if op == ssa.OpAMD64BTQconst && v.AuxInt < 32 {
   752  			// Emit 32-bit version because it's shorter
   753  			op = ssa.OpAMD64BTLconst
   754  		}
   755  		p := s.Prog(op.Asm())
   756  		p.From.Type = obj.TYPE_CONST
   757  		p.From.Offset = v.AuxInt
   758  		p.To.Type = obj.TYPE_REG
   759  		p.To.Reg = v.Args[0].Reg()
   760  	case ssa.OpAMD64CMPQload, ssa.OpAMD64CMPLload, ssa.OpAMD64CMPWload, ssa.OpAMD64CMPBload:
   761  		p := s.Prog(v.Op.Asm())
   762  		p.From.Type = obj.TYPE_MEM
   763  		p.From.Reg = v.Args[0].Reg()
   764  		ssagen.AddAux(&p.From, v)
   765  		p.To.Type = obj.TYPE_REG
   766  		p.To.Reg = v.Args[1].Reg()
   767  	case ssa.OpAMD64CMPQconstload, ssa.OpAMD64CMPLconstload, ssa.OpAMD64CMPWconstload, ssa.OpAMD64CMPBconstload:
   768  		sc := v.AuxValAndOff()
   769  		p := s.Prog(v.Op.Asm())
   770  		p.From.Type = obj.TYPE_MEM
   771  		p.From.Reg = v.Args[0].Reg()
   772  		ssagen.AddAux2(&p.From, v, sc.Off64())
   773  		p.To.Type = obj.TYPE_CONST
   774  		p.To.Offset = sc.Val64()
   775  	case ssa.OpAMD64CMPQloadidx8, ssa.OpAMD64CMPQloadidx1, ssa.OpAMD64CMPLloadidx4, ssa.OpAMD64CMPLloadidx1, ssa.OpAMD64CMPWloadidx2, ssa.OpAMD64CMPWloadidx1, ssa.OpAMD64CMPBloadidx1:
   776  		p := s.Prog(v.Op.Asm())
   777  		memIdx(&p.From, v)
   778  		ssagen.AddAux(&p.From, v)
   779  		p.To.Type = obj.TYPE_REG
   780  		p.To.Reg = v.Args[2].Reg()
   781  	case ssa.OpAMD64CMPQconstloadidx8, ssa.OpAMD64CMPQconstloadidx1, ssa.OpAMD64CMPLconstloadidx4, ssa.OpAMD64CMPLconstloadidx1, ssa.OpAMD64CMPWconstloadidx2, ssa.OpAMD64CMPWconstloadidx1, ssa.OpAMD64CMPBconstloadidx1:
   782  		sc := v.AuxValAndOff()
   783  		p := s.Prog(v.Op.Asm())
   784  		memIdx(&p.From, v)
   785  		ssagen.AddAux2(&p.From, v, sc.Off64())
   786  		p.To.Type = obj.TYPE_CONST
   787  		p.To.Offset = sc.Val64()
   788  	case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
   789  		x := v.Reg()
   790  
   791  		// If flags aren't live (indicated by v.Aux == nil),
   792  		// then we can rewrite MOV $0, AX into XOR AX, AX.
   793  		if v.AuxInt == 0 && v.Aux == nil {
   794  			opregreg(s, x86.AXORL, x, x)
   795  			break
   796  		}
   797  
   798  		asm := v.Op.Asm()
   799  		// Use MOVL to move a small constant into a register
   800  		// when the constant is positive and fits into 32 bits.
   801  		if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
   802  		// The upper 32 bits are zeroed automatically when using MOVL.
   803  			asm = x86.AMOVL
   804  		}
   805  		p := s.Prog(asm)
   806  		p.From.Type = obj.TYPE_CONST
   807  		p.From.Offset = v.AuxInt
   808  		p.To.Type = obj.TYPE_REG
   809  		p.To.Reg = x
   810  	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
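        		// AuxInt holds the constant as float64 bits, even for the
        		// 32-bit (MOVSS) case.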
   811  		x := v.Reg()
   812  		p := s.Prog(v.Op.Asm())
   813  		p.From.Type = obj.TYPE_FCONST
   814  		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
   815  		p.To.Type = obj.TYPE_REG
   816  		p.To.Reg = x
   817  	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
   818  		ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
   819  		ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
   820  		p := s.Prog(v.Op.Asm())
   821  		p.From.Type = obj.TYPE_MEM
   822  		p.From.Reg = v.Args[0].Reg()
   823  		ssagen.AddAux(&p.From, v)
   824  		p.To.Type = obj.TYPE_REG
   825  		p.To.Reg = v.Reg()
   826  	case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1,
   827  		ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8, ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4, ssa.OpAMD64MOVWloadidx2,
   828  		ssa.OpAMD64MOVBELloadidx1, ssa.OpAMD64MOVBELloadidx4, ssa.OpAMD64MOVBELloadidx8, ssa.OpAMD64MOVBEQloadidx1, ssa.OpAMD64MOVBEQloadidx8:
   829  		p := s.Prog(v.Op.Asm())
   830  		memIdx(&p.From, v)
   831  		ssagen.AddAux(&p.From, v)
   832  		p.To.Type = obj.TYPE_REG
   833  		p.To.Reg = v.Reg()
   834  	case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore,
   835  		ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify,
   836  		ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify,
   837  		ssa.OpAMD64MOVBEQstore, ssa.OpAMD64MOVBELstore, ssa.OpAMD64MOVBEWstore:
   838  		p := s.Prog(v.Op.Asm())
   839  		p.From.Type = obj.TYPE_REG
   840  		p.From.Reg = v.Args[1].Reg()
   841  		p.To.Type = obj.TYPE_MEM
   842  		p.To.Reg = v.Args[0].Reg()
   843  		ssagen.AddAux(&p.To, v)
   844  	case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1,
   845  		ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8, ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4, ssa.OpAMD64MOVWstoreidx2,
   846  		ssa.OpAMD64ADDLmodifyidx1, ssa.OpAMD64ADDLmodifyidx4, ssa.OpAMD64ADDLmodifyidx8, ssa.OpAMD64ADDQmodifyidx1, ssa.OpAMD64ADDQmodifyidx8,
   847  		ssa.OpAMD64SUBLmodifyidx1, ssa.OpAMD64SUBLmodifyidx4, ssa.OpAMD64SUBLmodifyidx8, ssa.OpAMD64SUBQmodifyidx1, ssa.OpAMD64SUBQmodifyidx8,
   848  		ssa.OpAMD64ANDLmodifyidx1, ssa.OpAMD64ANDLmodifyidx4, ssa.OpAMD64ANDLmodifyidx8, ssa.OpAMD64ANDQmodifyidx1, ssa.OpAMD64ANDQmodifyidx8,
   849  		ssa.OpAMD64ORLmodifyidx1, ssa.OpAMD64ORLmodifyidx4, ssa.OpAMD64ORLmodifyidx8, ssa.OpAMD64ORQmodifyidx1, ssa.OpAMD64ORQmodifyidx8,
   850  		ssa.OpAMD64XORLmodifyidx1, ssa.OpAMD64XORLmodifyidx4, ssa.OpAMD64XORLmodifyidx8, ssa.OpAMD64XORQmodifyidx1, ssa.OpAMD64XORQmodifyidx8,
   851  		ssa.OpAMD64MOVBEWstoreidx1, ssa.OpAMD64MOVBEWstoreidx2, ssa.OpAMD64MOVBELstoreidx1, ssa.OpAMD64MOVBELstoreidx4, ssa.OpAMD64MOVBELstoreidx8, ssa.OpAMD64MOVBEQstoreidx1, ssa.OpAMD64MOVBEQstoreidx8:
   852  		p := s.Prog(v.Op.Asm())
   853  		p.From.Type = obj.TYPE_REG
   854  		p.From.Reg = v.Args[2].Reg()
   855  		memIdx(&p.To, v)
   856  		ssagen.AddAux(&p.To, v)
   857  	case ssa.OpAMD64ADDQconstmodify, ssa.OpAMD64ADDLconstmodify:
   858  		sc := v.AuxValAndOff()
   859  		off := sc.Off64()
   860  		val := sc.Val()
   861  		if val == 1 || val == -1 {
   862  			var asm obj.As
   863  			if v.Op == ssa.OpAMD64ADDQconstmodify {
   864  				if val == 1 {
   865  					asm = x86.AINCQ
   866  				} else {
   867  					asm = x86.ADECQ
   868  				}
   869  			} else {
   870  				if val == 1 {
   871  					asm = x86.AINCL
   872  				} else {
   873  					asm = x86.ADECL
   874  				}
   875  			}
   876  			p := s.Prog(asm)
   877  			p.To.Type = obj.TYPE_MEM
   878  			p.To.Reg = v.Args[0].Reg()
   879  			ssagen.AddAux2(&p.To, v, off)
   880  			break
   881  		}
   882  		fallthrough
   883  	case ssa.OpAMD64ANDQconstmodify, ssa.OpAMD64ANDLconstmodify, ssa.OpAMD64ORQconstmodify, ssa.OpAMD64ORLconstmodify,
   884  		ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify,
   885  		ssa.OpAMD64BTSQconstmodify, ssa.OpAMD64BTRQconstmodify, ssa.OpAMD64BTCQconstmodify:
   886  		sc := v.AuxValAndOff()
   887  		off := sc.Off64()
   888  		val := sc.Val64()
   889  		p := s.Prog(v.Op.Asm())
   890  		p.From.Type = obj.TYPE_CONST
   891  		p.From.Offset = val
   892  		p.To.Type = obj.TYPE_MEM
   893  		p.To.Reg = v.Args[0].Reg()
   894  		ssagen.AddAux2(&p.To, v, off)
   895  
   896  	case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
   897  		p := s.Prog(v.Op.Asm())
   898  		p.From.Type = obj.TYPE_CONST
   899  		sc := v.AuxValAndOff()
   900  		p.From.Offset = sc.Val64()
   901  		p.To.Type = obj.TYPE_MEM
   902  		p.To.Reg = v.Args[0].Reg()
   903  		ssagen.AddAux2(&p.To, v, sc.Off64())
   904  	case ssa.OpAMD64MOVOstoreconst:
   905  		sc := v.AuxValAndOff()
   906  		if sc.Val() != 0 {
   907  			v.Fatalf("MOVO for non zero constants not implemented: %s", v.LongString())
   908  		}
   909  
   910  		if s.ABI != obj.ABIInternal {
   911  			// zero X15 manually
   912  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
   913  		}
   914  		p := s.Prog(v.Op.Asm())
   915  		p.From.Type = obj.TYPE_REG
   916  		p.From.Reg = x86.REG_X15
   917  		p.To.Type = obj.TYPE_MEM
   918  		p.To.Reg = v.Args[0].Reg()
   919  		ssagen.AddAux2(&p.To, v, sc.Off64())
   920  
   921  	case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1,
   922  		ssa.OpAMD64ADDLconstmodifyidx1, ssa.OpAMD64ADDLconstmodifyidx4, ssa.OpAMD64ADDLconstmodifyidx8, ssa.OpAMD64ADDQconstmodifyidx1, ssa.OpAMD64ADDQconstmodifyidx8,
   923  		ssa.OpAMD64ANDLconstmodifyidx1, ssa.OpAMD64ANDLconstmodifyidx4, ssa.OpAMD64ANDLconstmodifyidx8, ssa.OpAMD64ANDQconstmodifyidx1, ssa.OpAMD64ANDQconstmodifyidx8,
   924  		ssa.OpAMD64ORLconstmodifyidx1, ssa.OpAMD64ORLconstmodifyidx4, ssa.OpAMD64ORLconstmodifyidx8, ssa.OpAMD64ORQconstmodifyidx1, ssa.OpAMD64ORQconstmodifyidx8,
   925  		ssa.OpAMD64XORLconstmodifyidx1, ssa.OpAMD64XORLconstmodifyidx4, ssa.OpAMD64XORLconstmodifyidx8, ssa.OpAMD64XORQconstmodifyidx1, ssa.OpAMD64XORQconstmodifyidx8:
   926  		p := s.Prog(v.Op.Asm())
   927  		p.From.Type = obj.TYPE_CONST
   928  		sc := v.AuxValAndOff()
   929  		p.From.Offset = sc.Val64()
   930  		switch {
   931  		case p.As == x86.AADDQ && p.From.Offset == 1:
   932  			p.As = x86.AINCQ
   933  			p.From.Type = obj.TYPE_NONE
   934  		case p.As == x86.AADDQ && p.From.Offset == -1:
   935  			p.As = x86.ADECQ
   936  			p.From.Type = obj.TYPE_NONE
   937  		case p.As == x86.AADDL && p.From.Offset == 1:
   938  			p.As = x86.AINCL
   939  			p.From.Type = obj.TYPE_NONE
   940  		case p.As == x86.AADDL && p.From.Offset == -1:
   941  			p.As = x86.ADECL
   942  			p.From.Type = obj.TYPE_NONE
   943  		}
   944  		memIdx(&p.To, v)
   945  		ssagen.AddAux2(&p.To, v, sc.Off64())
   946  	case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
   947  		ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
   948  		ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS, ssa.OpAMD64VPBROADCASTB, ssa.OpAMD64PMOVMSKB:
   949  		opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg())
   950  	case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
   951  		r := v.Reg()
   952  		// Break false dependency on destination register.
   953  		opregreg(s, x86.AXORPS, r, r)
   954  		opregreg(s, v.Op.Asm(), r, v.Args[0].Reg())
   955  	case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i, ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
   956  		var p *obj.Prog
   957  		switch v.Op {
   958  		case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i:
   959  			p = s.Prog(x86.AMOVQ)
   960  		case ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
   961  			p = s.Prog(x86.AMOVL)
   962  		}
   963  		p.From.Type = obj.TYPE_REG
   964  		p.From.Reg = v.Args[0].Reg()
   965  		p.To.Type = obj.TYPE_REG
   966  		p.To.Reg = v.Reg()
   967  	case ssa.OpAMD64ADDQload, ssa.OpAMD64ADDLload, ssa.OpAMD64SUBQload, ssa.OpAMD64SUBLload,
   968  		ssa.OpAMD64ANDQload, ssa.OpAMD64ANDLload, ssa.OpAMD64ORQload, ssa.OpAMD64ORLload,
   969  		ssa.OpAMD64XORQload, ssa.OpAMD64XORLload, ssa.OpAMD64ADDSDload, ssa.OpAMD64ADDSSload,
   970  		ssa.OpAMD64SUBSDload, ssa.OpAMD64SUBSSload, ssa.OpAMD64MULSDload, ssa.OpAMD64MULSSload,
   971  		ssa.OpAMD64DIVSDload, ssa.OpAMD64DIVSSload:
   972  		p := s.Prog(v.Op.Asm())
   973  		p.From.Type = obj.TYPE_MEM
   974  		p.From.Reg = v.Args[1].Reg()
   975  		ssagen.AddAux(&p.From, v)
   976  		p.To.Type = obj.TYPE_REG
   977  		p.To.Reg = v.Reg()
   978  	case ssa.OpAMD64ADDLloadidx1, ssa.OpAMD64ADDLloadidx4, ssa.OpAMD64ADDLloadidx8, ssa.OpAMD64ADDQloadidx1, ssa.OpAMD64ADDQloadidx8,
   979  		ssa.OpAMD64SUBLloadidx1, ssa.OpAMD64SUBLloadidx4, ssa.OpAMD64SUBLloadidx8, ssa.OpAMD64SUBQloadidx1, ssa.OpAMD64SUBQloadidx8,
   980  		ssa.OpAMD64ANDLloadidx1, ssa.OpAMD64ANDLloadidx4, ssa.OpAMD64ANDLloadidx8, ssa.OpAMD64ANDQloadidx1, ssa.OpAMD64ANDQloadidx8,
   981  		ssa.OpAMD64ORLloadidx1, ssa.OpAMD64ORLloadidx4, ssa.OpAMD64ORLloadidx8, ssa.OpAMD64ORQloadidx1, ssa.OpAMD64ORQloadidx8,
   982  		ssa.OpAMD64XORLloadidx1, ssa.OpAMD64XORLloadidx4, ssa.OpAMD64XORLloadidx8, ssa.OpAMD64XORQloadidx1, ssa.OpAMD64XORQloadidx8,
   983  		ssa.OpAMD64ADDSSloadidx1, ssa.OpAMD64ADDSSloadidx4, ssa.OpAMD64ADDSDloadidx1, ssa.OpAMD64ADDSDloadidx8,
   984  		ssa.OpAMD64SUBSSloadidx1, ssa.OpAMD64SUBSSloadidx4, ssa.OpAMD64SUBSDloadidx1, ssa.OpAMD64SUBSDloadidx8,
   985  		ssa.OpAMD64MULSSloadidx1, ssa.OpAMD64MULSSloadidx4, ssa.OpAMD64MULSDloadidx1, ssa.OpAMD64MULSDloadidx8,
   986  		ssa.OpAMD64DIVSSloadidx1, ssa.OpAMD64DIVSSloadidx4, ssa.OpAMD64DIVSDloadidx1, ssa.OpAMD64DIVSDloadidx8:
   987  		p := s.Prog(v.Op.Asm())
   988  
   989  		r, i := v.Args[1].Reg(), v.Args[2].Reg()
   990  		p.From.Type = obj.TYPE_MEM
   991  		p.From.Scale = v.Op.Scale()
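        		// As in memIdx: SP cannot be an index register, but with scale 1
        		// the base and index can be swapped.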
   992  		if p.From.Scale == 1 && i == x86.REG_SP {
   993  			r, i = i, r
   994  		}
   995  		p.From.Reg = r
   996  		p.From.Index = i
   997  
   998  		ssagen.AddAux(&p.From, v)
   999  		p.To.Type = obj.TYPE_REG
  1000  		p.To.Reg = v.Reg()
  1001  	case ssa.OpAMD64DUFFZERO:
  1002  		if s.ABI != obj.ABIInternal {
  1003  			// zero X15 manually
  1004  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
  1005  		}
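        		// Jump into the middle of duffzero at offset off; adj (if nonzero)
        		// pre-adjusts DI so the partial block's MOVUPS offsets line up
        		// with the start of the region (see duff above).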
  1006  		off := duffStart(v.AuxInt)
  1007  		adj := duffAdj(v.AuxInt)
  1008  		var p *obj.Prog
  1009  		if adj != 0 {
  1010  			p = s.Prog(x86.ALEAQ)
  1011  			p.From.Type = obj.TYPE_MEM
  1012  			p.From.Offset = adj
  1013  			p.From.Reg = x86.REG_DI
  1014  			p.To.Type = obj.TYPE_REG
  1015  			p.To.Reg = x86.REG_DI
  1016  		}
  1017  		p = s.Prog(obj.ADUFFZERO)
  1018  		p.To.Type = obj.TYPE_ADDR
  1019  		p.To.Sym = ir.Syms.Duffzero
  1020  		p.To.Offset = off
  1021  	case ssa.OpAMD64DUFFCOPY:
  1022  		p := s.Prog(obj.ADUFFCOPY)
  1023  		p.To.Type = obj.TYPE_ADDR
  1024  		p.To.Sym = ir.Syms.Duffcopy
  1025  		if v.AuxInt%16 != 0 {
  1026  			v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt)
  1027  		}
  1028  		p.To.Offset = 14 * (64 - v.AuxInt/16)
  1029  		// 14 and 64 are magic constants.  14 is the number of bytes to encode:
  1030  		//	MOVUPS	(SI), X0
  1031  		//	ADDQ	$16, SI
  1032  		//	MOVUPS	X0, (DI)
  1033  		//	ADDQ	$16, DI
  1034  		// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
  1035  
  1036  	case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
  1037  		if v.Type.IsMemory() {
  1038  			return
  1039  		}
  1040  		x := v.Args[0].Reg()
  1041  		y := v.Reg()
  1042  		if x != y {
  1043  			opregreg(s, moveByType(v.Type), y, x)
  1044  		}
  1045  	case ssa.OpLoadReg:
  1046  		if v.Type.IsFlags() {
  1047  			v.Fatalf("load flags not implemented: %v", v.LongString())
  1048  			return
  1049  		}
  1050  		p := s.Prog(loadByType(v.Type))
  1051  		ssagen.AddrAuto(&p.From, v.Args[0])
  1052  		p.To.Type = obj.TYPE_REG
  1053  		p.To.Reg = v.Reg()
  1054  
  1055  	case ssa.OpStoreReg:
  1056  		if v.Type.IsFlags() {
  1057  			v.Fatalf("store flags not implemented: %v", v.LongString())
  1058  			return
  1059  		}
  1060  		p := s.Prog(storeByType(v.Type))
  1061  		p.From.Type = obj.TYPE_REG
  1062  		p.From.Reg = v.Args[0].Reg()
  1063  		ssagen.AddrAuto(&p.To, v)
  1064  	case ssa.OpAMD64LoweredHasCPUFeature:
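        		// v.Aux is the symbol of a byte-sized CPU-feature flag variable;
        		// load it and zero-extend into the result register.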
  1065  		p := s.Prog(x86.AMOVBLZX)
  1066  		p.From.Type = obj.TYPE_MEM
  1067  		ssagen.AddAux(&p.From, v)
  1068  		p.To.Type = obj.TYPE_REG
  1069  		p.To.Reg = v.Reg()
  1070  	case ssa.OpArgIntReg, ssa.OpArgFloatReg:
  1071  		// The assembler needs to wrap the entry safepoint/stack growth code with spill/unspill.
  1072  		// This work happens only once per function: RegArgs is cleared below once recorded.
  1073  		for _, ap := range v.Block.Func.RegArgs {
  1074  			// Pass the spill/unspill information along to the assembler, offset by size of return PC pushed on stack.
  1075  			addr := ssagen.SpillSlotAddr(ap, x86.REG_SP, v.Block.Func.Config.PtrSize)
  1076  			s.FuncInfo().AddSpill(
  1077  				obj.RegSpill{Reg: ap.Reg, Addr: addr, Unspill: loadByType(ap.Type), Spill: storeByType(ap.Type)})
  1078  		}
  1079  		v.Block.Func.RegArgs = nil
  1080  		ssagen.CheckArgReg(v)
  1081  	case ssa.OpAMD64LoweredGetClosurePtr:
  1082  		// Closure pointer is DX.
  1083  		ssagen.CheckLoweredGetClosurePtr(v)
  1084  	case ssa.OpAMD64LoweredGetG:
  1085  		if s.ABI == obj.ABIInternal {
  1086  			v.Fatalf("LoweredGetG should not appear in ABIInternal")
  1087  		}
  1088  		r := v.Reg()
  1089  		getgFromTLS(s, r)
  1090  	case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail:
  1091  		if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal {
  1092  			// zeroing X15 when entering ABIInternal from ABI0
  1093  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
  1094  			// set G register from TLS
  1095  			getgFromTLS(s, x86.REG_R14)
  1096  		}
  1097  		if v.Op == ssa.OpAMD64CALLtail {
  1098  			s.TailCall(v)
  1099  			break
  1100  		}
  1101  		s.Call(v)
  1102  		if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 {
  1103  			// restore the zeroed X15 after returning from ABI0 code
  1104  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
  1105  			// set G register from TLS
  1106  			getgFromTLS(s, x86.REG_R14)
  1107  		}
  1108  	case ssa.OpAMD64CALLclosure, ssa.OpAMD64CALLinter:
  1109  		s.Call(v)
  1110  
  1111  	case ssa.OpAMD64LoweredGetCallerPC:
  1112  		p := s.Prog(x86.AMOVQ)
  1113  		p.From.Type = obj.TYPE_MEM
  1114  		p.From.Offset = -8 // PC is stored 8 bytes below first parameter.
  1115  		p.From.Name = obj.NAME_PARAM
  1116  		p.To.Type = obj.TYPE_REG
  1117  		p.To.Reg = v.Reg()
  1118  
  1119  	case ssa.OpAMD64LoweredGetCallerSP:
  1120  		// caller's SP is the address of the first arg
  1121  		mov := x86.AMOVQ
  1122  		if types.PtrSize == 4 {
  1123  			mov = x86.AMOVL
  1124  		}
  1125  		p := s.Prog(mov)
  1126  		p.From.Type = obj.TYPE_ADDR
  1127  		p.From.Offset = -base.Ctxt.Arch.FixedFrameSize // 0 on amd64, just to be consistent with other architectures
  1128  		p.From.Name = obj.NAME_PARAM
  1129  		p.To.Type = obj.TYPE_REG
  1130  		p.To.Reg = v.Reg()
  1131  
  1132  	case ssa.OpAMD64LoweredWB:
  1133  		p := s.Prog(obj.ACALL)
  1134  		p.To.Type = obj.TYPE_MEM
  1135  		p.To.Name = obj.NAME_EXTERN
  1136  		// AuxInt encodes how many buffer entries we need.
  1137  		p.To.Sym = ir.Syms.GCWriteBarrier[v.AuxInt-1]
  1138  
  1139  	case ssa.OpAMD64LoweredPanicBoundsRR, ssa.OpAMD64LoweredPanicBoundsRC, ssa.OpAMD64LoweredPanicBoundsCR, ssa.OpAMD64LoweredPanicBoundsCC:
  1140  		// Compute the constant we put in the PCData entry for this call.
  1141  		code, signed := ssa.BoundsKind(v.AuxInt).Code()
  1142  		xIsReg := false
  1143  		yIsReg := false
  1144  		xVal := 0
  1145  		yVal := 0
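        		// Each of x and y is either a small constant or a register index
        		// relative to AX. A constant too large to encode directly is
        		// first materialized into a register chosen to differ from the
        		// other operand's register.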
  1146  		switch v.Op {
  1147  		case ssa.OpAMD64LoweredPanicBoundsRR:
  1148  			xIsReg = true
  1149  			xVal = int(v.Args[0].Reg() - x86.REG_AX)
  1150  			yIsReg = true
  1151  			yVal = int(v.Args[1].Reg() - x86.REG_AX)
  1152  		case ssa.OpAMD64LoweredPanicBoundsRC:
  1153  			xIsReg = true
  1154  			xVal = int(v.Args[0].Reg() - x86.REG_AX)
  1155  			c := v.Aux.(ssa.PanicBoundsC).C
  1156  			if c >= 0 && c <= abi.BoundsMaxConst {
  1157  				yVal = int(c)
  1158  			} else {
  1159  				// Move constant to a register
  1160  				yIsReg = true
  1161  				if yVal == xVal {
  1162  					yVal = 1
  1163  				}
  1164  				p := s.Prog(x86.AMOVQ)
  1165  				p.From.Type = obj.TYPE_CONST
  1166  				p.From.Offset = c
  1167  				p.To.Type = obj.TYPE_REG
  1168  				p.To.Reg = x86.REG_AX + int16(yVal)
  1169  			}
  1170  		case ssa.OpAMD64LoweredPanicBoundsCR:
  1171  			yIsReg = true
  1172  			yVal = int(v.Args[0].Reg() - x86.REG_AX)
  1173  			c := v.Aux.(ssa.PanicBoundsC).C
  1174  			if c >= 0 && c <= abi.BoundsMaxConst {
  1175  				xVal = int(c)
  1176  			} else {
  1177  				// Move constant to a register
  1178  				xIsReg = true
  1179  				if xVal == yVal {
  1180  					xVal = 1
  1181  				}
  1182  				p := s.Prog(x86.AMOVQ)
  1183  				p.From.Type = obj.TYPE_CONST
  1184  				p.From.Offset = c
  1185  				p.To.Type = obj.TYPE_REG
  1186  				p.To.Reg = x86.REG_AX + int16(xVal)
  1187  			}
  1188  		case ssa.OpAMD64LoweredPanicBoundsCC:
  1189  			c := v.Aux.(ssa.PanicBoundsCC).Cx
  1190  			if c >= 0 && c <= abi.BoundsMaxConst {
  1191  				xVal = int(c)
  1192  			} else {
  1193  				// Move constant to a register
  1194  				xIsReg = true
  1195  				p := s.Prog(x86.AMOVQ)
  1196  				p.From.Type = obj.TYPE_CONST
  1197  				p.From.Offset = c
  1198  				p.To.Type = obj.TYPE_REG
  1199  				p.To.Reg = x86.REG_AX + int16(xVal)
  1200  			}
  1201  			c = v.Aux.(ssa.PanicBoundsCC).Cy
  1202  			if c >= 0 && c <= abi.BoundsMaxConst {
  1203  				yVal = int(c)
  1204  			} else {
  1205  				// Move constant to a register
  1206  				yIsReg = true
  1207  				yVal = 1
  1208  				p := s.Prog(x86.AMOVQ)
  1209  				p.From.Type = obj.TYPE_CONST
  1210  				p.From.Offset = c
  1211  				p.To.Type = obj.TYPE_REG
  1212  				p.To.Reg = x86.REG_AX + int16(yVal)
  1213  			}
  1214  		}
  1215  		c := abi.BoundsEncode(code, signed, xIsReg, yIsReg, xVal, yVal)
  1216  
  1217  		p := s.Prog(obj.APCDATA)
  1218  		p.From.SetConst(abi.PCDATA_PanicBounds)
  1219  		p.To.SetConst(int64(c))
  1220  		p = s.Prog(obj.ACALL)
  1221  		p.To.Type = obj.TYPE_MEM
  1222  		p.To.Name = obj.NAME_EXTERN
  1223  		p.To.Sym = ir.Syms.PanicBounds
  1224  
  1225  	case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
  1226  		ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
  1227  		ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
  1228  		p := s.Prog(v.Op.Asm())
  1229  		p.To.Type = obj.TYPE_REG
  1230  		p.To.Reg = v.Reg()
  1231  
  1232  	case ssa.OpAMD64NEGLflags:
  1233  		p := s.Prog(v.Op.Asm())
  1234  		p.To.Type = obj.TYPE_REG
  1235  		p.To.Reg = v.Reg0()
  1236  
  1237  	case ssa.OpAMD64ADDQconstflags, ssa.OpAMD64ADDLconstflags:
  1238  		p := s.Prog(v.Op.Asm())
  1239  		p.From.Type = obj.TYPE_CONST
  1240  		p.From.Offset = v.AuxInt
  1241  		// Note: the inc/dec instructions do not modify
  1242  		// the carry flag like add$1 / sub$1 do.
  1243  		// We currently never use the CF/OF flags from
  1244  		// these instructions, so that is ok.
  1245  		switch {
  1246  		case p.As == x86.AADDQ && p.From.Offset == 1:
  1247  			p.As = x86.AINCQ
  1248  			p.From.Type = obj.TYPE_NONE
  1249  		case p.As == x86.AADDQ && p.From.Offset == -1:
  1250  			p.As = x86.ADECQ
  1251  			p.From.Type = obj.TYPE_NONE
  1252  		case p.As == x86.AADDL && p.From.Offset == 1:
  1253  			p.As = x86.AINCL
  1254  			p.From.Type = obj.TYPE_NONE
  1255  		case p.As == x86.AADDL && p.From.Offset == -1:
  1256  			p.As = x86.ADECL
  1257  			p.From.Type = obj.TYPE_NONE
  1258  		}
  1259  		p.To.Type = obj.TYPE_REG
  1260  		p.To.Reg = v.Reg0()
  1261  
  1262  	case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS:
  1263  		p := s.Prog(v.Op.Asm())
  1264  		p.From.Type = obj.TYPE_REG
  1265  		p.From.Reg = v.Args[0].Reg()
  1266  		p.To.Type = obj.TYPE_REG
  1267  		switch v.Op {
  1268  		case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ:
  1269  			p.To.Reg = v.Reg0()
  1270  		case ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS:
  1271  			p.To.Reg = v.Reg()
  1272  		}
  1273  	case ssa.OpAMD64LoweredRound32F, ssa.OpAMD64LoweredRound64F:
  1274  		// input is already rounded
  1275  	case ssa.OpAMD64ROUNDSD:
  1276  		p := s.Prog(v.Op.Asm())
  1277  		val := v.AuxInt
  1278  		// 0 means math.RoundToEven, 1 Floor, 2 Ceil, 3 Trunc
  1279  		if val < 0 || val > 3 {
  1280  			v.Fatalf("Invalid rounding mode")
  1281  		}
  1282  		p.From.Offset = val
  1283  		p.From.Type = obj.TYPE_CONST
  1284  		p.AddRestSourceReg(v.Args[0].Reg())
  1285  		p.To.Type = obj.TYPE_REG
  1286  		p.To.Reg = v.Reg()
  1287  	case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL,
  1288  		ssa.OpAMD64TZCNTQ, ssa.OpAMD64TZCNTL,
  1289  		ssa.OpAMD64LZCNTQ, ssa.OpAMD64LZCNTL:
  1290  		if v.Args[0].Reg() != v.Reg() {
  1291  			// POPCNT/TZCNT/LZCNT have a false dependency on the destination register on Intel cpus.
  1292  			// TZCNT/LZCNT problem affects pre-Skylake models. See discussion at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62011#c7.
  1293  			// Xor register with itself to break the dependency.
  1294  			opregreg(s, x86.AXORL, v.Reg(), v.Reg())
  1295  		}
  1296  		p := s.Prog(v.Op.Asm())
  1297  		p.From.Type = obj.TYPE_REG
  1298  		p.From.Reg = v.Args[0].Reg()
  1299  		p.To.Type = obj.TYPE_REG
  1300  		p.To.Reg = v.Reg()
  1301  
  1302  	case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
  1303  		ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
  1304  		ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
  1305  		ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF,
  1306  		ssa.OpAMD64SETB, ssa.OpAMD64SETBE,
  1307  		ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN,
  1308  		ssa.OpAMD64SETA, ssa.OpAMD64SETAE,
  1309  		ssa.OpAMD64SETO:
  1310  		p := s.Prog(v.Op.Asm())
  1311  		p.To.Type = obj.TYPE_REG
  1312  		p.To.Reg = v.Reg()
  1313  
  1314  	case ssa.OpAMD64SETEQstore, ssa.OpAMD64SETNEstore,
  1315  		ssa.OpAMD64SETLstore, ssa.OpAMD64SETLEstore,
  1316  		ssa.OpAMD64SETGstore, ssa.OpAMD64SETGEstore,
  1317  		ssa.OpAMD64SETBstore, ssa.OpAMD64SETBEstore,
  1318  		ssa.OpAMD64SETAstore, ssa.OpAMD64SETAEstore:
  1319  		p := s.Prog(v.Op.Asm())
  1320  		p.To.Type = obj.TYPE_MEM
  1321  		p.To.Reg = v.Args[0].Reg()
  1322  		ssagen.AddAux(&p.To, v)
  1323  
  1324  	case ssa.OpAMD64SETEQstoreidx1, ssa.OpAMD64SETNEstoreidx1,
  1325  		ssa.OpAMD64SETLstoreidx1, ssa.OpAMD64SETLEstoreidx1,
  1326  		ssa.OpAMD64SETGstoreidx1, ssa.OpAMD64SETGEstoreidx1,
  1327  		ssa.OpAMD64SETBstoreidx1, ssa.OpAMD64SETBEstoreidx1,
  1328  		ssa.OpAMD64SETAstoreidx1, ssa.OpAMD64SETAEstoreidx1:
  1329  		p := s.Prog(v.Op.Asm())
  1330  		memIdx(&p.To, v)
  1331  		ssagen.AddAux(&p.To, v)
  1332  
  1333  	case ssa.OpAMD64SETNEF:
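        		// Floating-point "not equal" is true when ZF==0 or when the
        		// comparison was unordered (PF==1), so OR SETNE with SETPS.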
  1334  		t := v.RegTmp()
  1335  		p := s.Prog(v.Op.Asm())
  1336  		p.To.Type = obj.TYPE_REG
  1337  		p.To.Reg = v.Reg()
  1338  		q := s.Prog(x86.ASETPS)
  1339  		q.To.Type = obj.TYPE_REG
  1340  		q.To.Reg = t
  1341  		// ORL avoids a partial register write and is smaller than the ORQ the old compiler used.
  1342  		opregreg(s, x86.AORL, v.Reg(), t)
  1343  
  1344  	case ssa.OpAMD64SETEQF:
  1345  		t := v.RegTmp()
  1346  		p := s.Prog(v.Op.Asm())
  1347  		p.To.Type = obj.TYPE_REG
  1348  		p.To.Reg = v.Reg()
  1349  		q := s.Prog(x86.ASETPC)
  1350  		q.To.Type = obj.TYPE_REG
  1351  		q.To.Reg = t
  1352  		// ANDL avoids a partial register write and is smaller than the ANDQ the old compiler used.
  1353  		opregreg(s, x86.AANDL, v.Reg(), t)
  1354  
  1355  	case ssa.OpAMD64InvertFlags:
  1356  		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
  1357  	case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT:
  1358  		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
  1359  	case ssa.OpAMD64AddTupleFirst32, ssa.OpAMD64AddTupleFirst64:
  1360  		v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString())
  1361  	case ssa.OpAMD64REPSTOSQ:
  1362  		s.Prog(x86.AREP)
  1363  		s.Prog(x86.ASTOSQ)
  1364  	case ssa.OpAMD64REPMOVSQ:
  1365  		s.Prog(x86.AREP)
  1366  		s.Prog(x86.AMOVSQ)
  1367  	case ssa.OpAMD64LoweredNilCheck:
  1368  		// Issue a load which will fault if the input is nil.
  1369  		// TODO: We currently use the 2-byte instruction TESTB AX, (reg).
  1370  		// Should we use the 3-byte TESTB $0, (reg) instead? It is larger
  1371  		// but it doesn't have false dependency on AX.
  1372  		// Or maybe allocate an output register and use MOVL (reg),reg2 ?
  1373  		// That trades clobbering flags for clobbering a register.
  1374  		p := s.Prog(x86.ATESTB)
  1375  		p.From.Type = obj.TYPE_REG
  1376  		p.From.Reg = x86.REG_AX
  1377  		p.To.Type = obj.TYPE_MEM
  1378  		p.To.Reg = v.Args[0].Reg()
  1379  		if logopt.Enabled() {
  1380  			logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name)
  1381  		}
  1382  		if base.Debug.Nil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers
  1383  			base.WarnfAt(v.Pos, "generated nil check")
  1384  		}
  1385  	case ssa.OpAMD64MOVBatomicload, ssa.OpAMD64MOVLatomicload, ssa.OpAMD64MOVQatomicload:
  1386  		p := s.Prog(v.Op.Asm())
  1387  		p.From.Type = obj.TYPE_MEM
  1388  		p.From.Reg = v.Args[0].Reg()
  1389  		ssagen.AddAux(&p.From, v)
  1390  		p.To.Type = obj.TYPE_REG
  1391  		p.To.Reg = v.Reg0()
  1392  	case ssa.OpAMD64XCHGB, ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ:
  1393  		p := s.Prog(v.Op.Asm())
  1394  		p.From.Type = obj.TYPE_REG
  1395  		p.From.Reg = v.Reg0()
  1396  		p.To.Type = obj.TYPE_MEM
  1397  		p.To.Reg = v.Args[1].Reg()
  1398  		ssagen.AddAux(&p.To, v)
  1399  	case ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock:
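        		// LOCK XADD atomically adds Reg0 to the memory operand and
        		// leaves the old memory value in Reg0.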
  1400  		s.Prog(x86.ALOCK)
  1401  		p := s.Prog(v.Op.Asm())
  1402  		p.From.Type = obj.TYPE_REG
  1403  		p.From.Reg = v.Reg0()
  1404  		p.To.Type = obj.TYPE_MEM
  1405  		p.To.Reg = v.Args[1].Reg()
  1406  		ssagen.AddAux(&p.To, v)
  1407  	case ssa.OpAMD64CMPXCHGLlock, ssa.OpAMD64CMPXCHGQlock:
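        		// LOCK CMPXCHG compares AX (the expected old value, Args[1]) with the
        		// memory operand; on a match it stores the new value (Args[2]) and sets ZF,
        		// otherwise it loads the memory value into AX. The SETEQ below turns ZF
        		// into the boolean result in Reg0.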
  1408  		if v.Args[1].Reg() != x86.REG_AX {
  1409  			v.Fatalf("input[1] not in AX %s", v.LongString())
  1410  		}
  1411  		s.Prog(x86.ALOCK)
  1412  		p := s.Prog(v.Op.Asm())
  1413  		p.From.Type = obj.TYPE_REG
  1414  		p.From.Reg = v.Args[2].Reg()
  1415  		p.To.Type = obj.TYPE_MEM
  1416  		p.To.Reg = v.Args[0].Reg()
  1417  		ssagen.AddAux(&p.To, v)
  1418  		p = s.Prog(x86.ASETEQ)
  1419  		p.To.Type = obj.TYPE_REG
  1420  		p.To.Reg = v.Reg0()
  1421  	case ssa.OpAMD64ANDBlock, ssa.OpAMD64ANDLlock, ssa.OpAMD64ANDQlock, ssa.OpAMD64ORBlock, ssa.OpAMD64ORLlock, ssa.OpAMD64ORQlock:
  1422  		// Atomic memory operations that don't need to return the old value.
  1423  		s.Prog(x86.ALOCK)
  1424  		p := s.Prog(v.Op.Asm())
  1425  		p.From.Type = obj.TYPE_REG
  1426  		p.From.Reg = v.Args[1].Reg()
  1427  		p.To.Type = obj.TYPE_MEM
  1428  		p.To.Reg = v.Args[0].Reg()
  1429  		ssagen.AddAux(&p.To, v)
  1430  	case ssa.OpAMD64LoweredAtomicAnd64, ssa.OpAMD64LoweredAtomicOr64, ssa.OpAMD64LoweredAtomicAnd32, ssa.OpAMD64LoweredAtomicOr32:
  1431  		// Atomic memory operations that need to return the old value.
  1432  		// We need to do these with compare-and-exchange to get access to the old value.
  1433  		// loop:
  1434  		// MOVQ mask, tmp
  1435  		// MOVQ (addr), AX
  1436  		// ANDQ AX, tmp
  1437  		// LOCK CMPXCHGQ tmp, (addr) : note that AX is the implicit old value to compare against
  1438  		// JNE loop
  1439  		// : result in AX
  1440  		mov := x86.AMOVQ
  1441  		op := x86.AANDQ
  1442  		cmpxchg := x86.ACMPXCHGQ
  1443  		switch v.Op {
  1444  		case ssa.OpAMD64LoweredAtomicOr64:
  1445  			op = x86.AORQ
  1446  		case ssa.OpAMD64LoweredAtomicAnd32:
  1447  			mov = x86.AMOVL
  1448  			op = x86.AANDL
  1449  			cmpxchg = x86.ACMPXCHGL
  1450  		case ssa.OpAMD64LoweredAtomicOr32:
  1451  			mov = x86.AMOVL
  1452  			op = x86.AORL
  1453  			cmpxchg = x86.ACMPXCHGL
  1454  		}
  1455  		addr := v.Args[0].Reg()
  1456  		mask := v.Args[1].Reg()
  1457  		tmp := v.RegTmp()
  1458  		p1 := s.Prog(mov)
  1459  		p1.From.Type = obj.TYPE_REG
  1460  		p1.From.Reg = mask
  1461  		p1.To.Type = obj.TYPE_REG
  1462  		p1.To.Reg = tmp
  1463  		p2 := s.Prog(mov)
  1464  		p2.From.Type = obj.TYPE_MEM
  1465  		p2.From.Reg = addr
  1466  		ssagen.AddAux(&p2.From, v)
  1467  		p2.To.Type = obj.TYPE_REG
  1468  		p2.To.Reg = x86.REG_AX
  1469  		p3 := s.Prog(op)
  1470  		p3.From.Type = obj.TYPE_REG
  1471  		p3.From.Reg = x86.REG_AX
  1472  		p3.To.Type = obj.TYPE_REG
  1473  		p3.To.Reg = tmp
  1474  		s.Prog(x86.ALOCK)
  1475  		p5 := s.Prog(cmpxchg)
  1476  		p5.From.Type = obj.TYPE_REG
  1477  		p5.From.Reg = tmp
  1478  		p5.To.Type = obj.TYPE_MEM
  1479  		p5.To.Reg = addr
  1480  		ssagen.AddAux(&p5.To, v)
  1481  		p6 := s.Prog(x86.AJNE)
  1482  		p6.To.Type = obj.TYPE_BRANCH
  1483  		p6.To.SetTarget(p1)
  1484  	case ssa.OpAMD64PrefetchT0, ssa.OpAMD64PrefetchNTA:
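        		// Prefetches are pure hints: they take only a memory operand,
        		// produce no result, and do not fault on bad addresses.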
  1485  		p := s.Prog(v.Op.Asm())
  1486  		p.From.Type = obj.TYPE_MEM
  1487  		p.From.Reg = v.Args[0].Reg()
  1488  	case ssa.OpClobber:
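        		// Overwrite a dead 8-byte stack slot with 0xdeaddead (as two 4-byte
        		// stores) so that stray reads of dead values are easier to catch;
        		// these ops appear only when clobberdead debugging is enabled.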
  1489  		p := s.Prog(x86.AMOVL)
  1490  		p.From.Type = obj.TYPE_CONST
  1491  		p.From.Offset = 0xdeaddead
  1492  		p.To.Type = obj.TYPE_MEM
  1493  		p.To.Reg = x86.REG_SP
  1494  		ssagen.AddAux(&p.To, v)
  1495  		p = s.Prog(x86.AMOVL)
  1496  		p.From.Type = obj.TYPE_CONST
  1497  		p.From.Offset = 0xdeaddead
  1498  		p.To.Type = obj.TYPE_MEM
  1499  		p.To.Reg = x86.REG_SP
  1500  		ssagen.AddAux(&p.To, v)
  1501  		p.To.Offset += 4
  1502  	case ssa.OpClobberReg:
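        		// Likewise, fill a dead register with a sentinel value; emitted only
        		// when the compiler's dead-register clobbering debug mode is enabled.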
  1503  		x := uint64(0xdeaddeaddeaddead)
  1504  		p := s.Prog(x86.AMOVQ)
  1505  		p.From.Type = obj.TYPE_CONST
  1506  		p.From.Offset = int64(x)
  1507  		p.To.Type = obj.TYPE_REG
  1508  		p.To.Reg = v.Reg()
  1509  	default:
  1510  		v.Fatalf("genValue not implemented: %s", v.LongString())
  1511  	}
  1512  }
  1513  
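        // blockJump maps a flags-based block kind to the conditional jump that
        // branches to Succs[0] and the inverted jump that branches to Succs[1].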
  1514  var blockJump = [...]struct {
  1515  	asm, invasm obj.As
  1516  }{
  1517  	ssa.BlockAMD64EQ:  {x86.AJEQ, x86.AJNE},
  1518  	ssa.BlockAMD64NE:  {x86.AJNE, x86.AJEQ},
  1519  	ssa.BlockAMD64LT:  {x86.AJLT, x86.AJGE},
  1520  	ssa.BlockAMD64GE:  {x86.AJGE, x86.AJLT},
  1521  	ssa.BlockAMD64LE:  {x86.AJLE, x86.AJGT},
  1522  	ssa.BlockAMD64GT:  {x86.AJGT, x86.AJLE},
  1523  	ssa.BlockAMD64OS:  {x86.AJOS, x86.AJOC},
  1524  	ssa.BlockAMD64OC:  {x86.AJOC, x86.AJOS},
  1525  	ssa.BlockAMD64ULT: {x86.AJCS, x86.AJCC},
  1526  	ssa.BlockAMD64UGE: {x86.AJCC, x86.AJCS},
  1527  	ssa.BlockAMD64UGT: {x86.AJHI, x86.AJLS},
  1528  	ssa.BlockAMD64ULE: {x86.AJLS, x86.AJHI},
  1529  	ssa.BlockAMD64ORD: {x86.AJPC, x86.AJPS},
  1530  	ssa.BlockAMD64NAN: {x86.AJPS, x86.AJPC},
  1531  }
  1532  
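        // eqfJumps and nefJumps are the two-instruction branch sequences used by
        // CombJump for the floating-point EQF and NEF blocks, which must test both
        // ZF and PF. The outer index selects which successor is the fallthrough.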
  1533  var eqfJumps = [2][2]ssagen.IndexJump{
  1534  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}}, // next == b.Succs[0]
  1535  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}}, // next == b.Succs[1]
  1536  }
  1537  var nefJumps = [2][2]ssagen.IndexJump{
  1538  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}}, // next == b.Succs[0]
  1539  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}}, // next == b.Succs[1]
  1540  }
  1541  
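        // ssaGenBlock generates the control-flow instructions at the end of block b.
        // next is the block that will be laid out immediately after b, so a jump to
        // next can be omitted.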
  1542  func ssaGenBlock(s *ssagen.State, b, next *ssa.Block) {
  1543  	switch b.Kind {
  1544  	case ssa.BlockPlain, ssa.BlockDefer:
  1545  		if b.Succs[0].Block() != next {
  1546  			p := s.Prog(obj.AJMP)
  1547  			p.To.Type = obj.TYPE_BRANCH
  1548  			s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[0].Block()})
  1549  		}
  1550  	case ssa.BlockExit, ssa.BlockRetJmp:
  1551  	case ssa.BlockRet:
  1552  		s.Prog(obj.ARET)
  1553  
  1554  	case ssa.BlockAMD64EQF:
  1555  		s.CombJump(b, next, &eqfJumps)
  1556  
  1557  	case ssa.BlockAMD64NEF:
  1558  		s.CombJump(b, next, &nefJumps)
  1559  
  1560  	case ssa.BlockAMD64EQ, ssa.BlockAMD64NE,
  1561  		ssa.BlockAMD64LT, ssa.BlockAMD64GE,
  1562  		ssa.BlockAMD64LE, ssa.BlockAMD64GT,
  1563  		ssa.BlockAMD64OS, ssa.BlockAMD64OC,
  1564  		ssa.BlockAMD64ULT, ssa.BlockAMD64UGT,
  1565  		ssa.BlockAMD64ULE, ssa.BlockAMD64UGE:
  1566  		jmp := blockJump[b.Kind]
  1567  		switch next {
  1568  		case b.Succs[0].Block():
  1569  			s.Br(jmp.invasm, b.Succs[1].Block())
  1570  		case b.Succs[1].Block():
  1571  			s.Br(jmp.asm, b.Succs[0].Block())
  1572  		default:
  1573  			if b.Likely != ssa.BranchUnlikely {
  1574  				s.Br(jmp.asm, b.Succs[0].Block())
  1575  				s.Br(obj.AJMP, b.Succs[1].Block())
  1576  			} else {
  1577  				s.Br(jmp.invasm, b.Succs[1].Block())
  1578  				s.Br(obj.AJMP, b.Succs[0].Block())
  1579  			}
  1580  		}
  1581  
  1582  	case ssa.BlockAMD64JUMPTABLE:
  1583  		// JMP      *(TABLE)(INDEX*8)
  1584  		p := s.Prog(obj.AJMP)
  1585  		p.To.Type = obj.TYPE_MEM
  1586  		p.To.Reg = b.Controls[1].Reg()
  1587  		p.To.Index = b.Controls[0].Reg()
  1588  		p.To.Scale = 8
  1589  		// Save jump tables for later resolution of the target blocks.
  1590  		s.JumpTables = append(s.JumpTables, b)
  1591  
  1592  	default:
  1593  		b.Fatalf("branch not implemented: %s", b.LongString())
  1594  	}
  1595  }
  1596  
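        // loadRegResult emits a load of the value of type t from n's stack slot
        // (at offset off within the slot) into register reg and returns the Prog.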
  1597  func loadRegResult(s *ssagen.State, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
  1598  	p := s.Prog(loadByType(t))
  1599  	p.From.Type = obj.TYPE_MEM
  1600  	p.From.Name = obj.NAME_AUTO
  1601  	p.From.Sym = n.Linksym()
  1602  	p.From.Offset = n.FrameOffset() + off
  1603  	p.To.Type = obj.TYPE_REG
  1604  	p.To.Reg = reg
  1605  	return p
  1606  }
  1607  
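        // spillArgReg appends a store that spills argument register reg to the stack
        // slot of parameter n at offset off; the spill is marked as not a statement.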
  1608  func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
  1609  	p = pp.Append(p, storeByType(t), obj.TYPE_REG, reg, 0, obj.TYPE_MEM, 0, n.FrameOffset()+off)
  1610  	p.To.Name = obj.NAME_PARAM
  1611  	p.To.Sym = n.Linksym()
  1612  	p.Pos = p.Pos.WithNotStmt()
  1613  	return p
  1614  }
  1615  