Source file src/cmd/compile/internal/amd64/ssa.go

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package amd64
     6  
     7  import (
     8  	"fmt"
     9  	"math"
    10  
    11  	"cmd/compile/internal/base"
    12  	"cmd/compile/internal/ir"
    13  	"cmd/compile/internal/logopt"
    14  	"cmd/compile/internal/objw"
    15  	"cmd/compile/internal/ssa"
    16  	"cmd/compile/internal/ssagen"
    17  	"cmd/compile/internal/types"
    18  	"cmd/internal/obj"
    19  	"cmd/internal/obj/x86"
    20  )
    21  
    22  // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
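         // A constant marked this way (Aux set to ssa.AuxMark) is later materialized
         // with a real MOV instead of being lowered to a flag-clobbering XOR; see the
         // MOVLconst/MOVQconst case in ssaGenValue below.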
    23  func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {
    24  	flive := b.FlagsLiveAtEnd
    25  	for _, c := range b.ControlValues() {
    26  		flive = c.Type.IsFlags() || flive
    27  	}
    28  	for i := len(b.Values) - 1; i >= 0; i-- {
    29  		v := b.Values[i]
    30  		if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) {
    31  			// The "mark" is any non-nil Aux value.
    32  			v.Aux = ssa.AuxMark
    33  		}
    34  		if v.Type.IsFlags() {
    35  			flive = false
    36  		}
    37  		for _, a := range v.Args {
    38  			if a.Type.IsFlags() {
    39  				flive = true
    40  			}
    41  		}
    42  	}
    43  }
    44  
    45  // loadByType returns the load instruction of the given type.
    46  func loadByType(t *types.Type) obj.As {
    47  	// Avoid partial register write
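         	// Small integer loads are zero-extended into a full 32-bit register
         	// (MOVBLZX/MOVWLZX) so later uses don't pay a partial-register penalty.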
    48  	if !t.IsFloat() {
    49  		switch t.Size() {
    50  		case 1:
    51  			return x86.AMOVBLZX
    52  		case 2:
    53  			return x86.AMOVWLZX
    54  		}
    55  	}
    56  	// Otherwise, there's no difference between load and store opcodes.
    57  	return storeByType(t)
    58  }
    59  
    60  // storeByType returns the store instruction of the given type.
    61  func storeByType(t *types.Type) obj.As {
    62  	width := t.Size()
    63  	if t.IsFloat() {
    64  		switch width {
    65  		case 4:
    66  			return x86.AMOVSS
    67  		case 8:
    68  			return x86.AMOVSD
    69  		}
    70  	} else {
    71  		switch width {
    72  		case 1:
    73  			return x86.AMOVB
    74  		case 2:
    75  			return x86.AMOVW
    76  		case 4:
    77  			return x86.AMOVL
    78  		case 8:
    79  			return x86.AMOVQ
    80  		case 16:
    81  			return x86.AMOVUPS
    82  		}
    83  	}
    84  	panic(fmt.Sprintf("bad store type %v", t))
    85  }
    86  
    87  // moveByType returns the reg->reg move instruction of the given type.
    88  func moveByType(t *types.Type) obj.As {
    89  	if t.IsFloat() {
    90  		// Moving the whole sse2 register is faster
    91  		// than moving just the correct low portion of it.
    92  		// There is no xmm->xmm move with 1 byte opcode,
    93  		// so use movups, which has 2 byte opcode.
    94  		return x86.AMOVUPS
    95  	} else {
    96  		switch t.Size() {
    97  		case 1:
    98  			// Avoids partial register write
    99  			return x86.AMOVL
   100  		case 2:
   101  			return x86.AMOVL
   102  		case 4:
   103  			return x86.AMOVL
   104  		case 8:
   105  			return x86.AMOVQ
   106  		case 16:
   107  			return x86.AMOVUPS // int128s are in SSE registers
   108  		default:
   109  			panic(fmt.Sprintf("bad int register width %d:%v", t.Size(), t))
   110  		}
   111  	}
   112  }
   113  
   114  // opregreg emits instructions for
   115  //
   116  //	dest := dest(To) op src(From)
   117  //
   118  // and also returns the created obj.Prog so it
   119  // may be further adjusted (offset, scale, etc).
   120  func opregreg(s *ssagen.State, op obj.As, dest, src int16) *obj.Prog {
   121  	p := s.Prog(op)
   122  	p.From.Type = obj.TYPE_REG
   123  	p.To.Type = obj.TYPE_REG
   124  	p.To.Reg = dest
   125  	p.From.Reg = src
   126  	return p
   127  }
   128  
   129  // memIdx fills out a as an indexed memory reference for v.
   130  // It assumes that the base register and the index register
   131  // are v.Args[0].Reg() and v.Args[1].Reg(), respectively.
   132  // The caller must still use gc.AddAux/gc.AddAux2 to handle v.Aux as necessary.
   133  func memIdx(a *obj.Addr, v *ssa.Value) {
   134  	r, i := v.Args[0].Reg(), v.Args[1].Reg()
   135  	a.Type = obj.TYPE_MEM
   136  	a.Scale = v.Op.Scale()
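         	// SP cannot be encoded as an index register, so if the index is SP and
         	// the scale is 1 (base and index are then interchangeable), swap them.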
   137  	if a.Scale == 1 && i == x86.REG_SP {
   138  		r, i = i, r
   139  	}
   140  	a.Reg = r
   141  	a.Index = i
   142  }
   143  
    144  // DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ.
    145  // See runtime/mkduff.go.
   146  func duffStart(size int64) int64 {
   147  	x, _ := duff(size)
   148  	return x
   149  }
   150  func duffAdj(size int64) int64 {
   151  	_, x := duff(size)
   152  	return x
   153  }
   154  
   155  // duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
   156  // required to use the duffzero mechanism for a block of the given size.
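         // For example, a size of 2*dzBlockLen*dzClearStep (two full blocks of clears)
         // yields off = dzBlockSize*(dzBlocks-2) and adj = 0: the call enters duffzero
         // two blocks before its end and needs no pointer adjustment.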
   157  func duff(size int64) (int64, int64) {
   158  	if size < 32 || size > 1024 || size%dzClearStep != 0 {
   159  		panic("bad duffzero size")
   160  	}
   161  	steps := size / dzClearStep
   162  	blocks := steps / dzBlockLen
   163  	steps %= dzBlockLen
   164  	off := dzBlockSize * (dzBlocks - blocks)
   165  	var adj int64
   166  	if steps != 0 {
   167  		off -= dzLeaqSize
   168  		off -= dzMovSize * steps
   169  		adj -= dzClearStep * (dzBlockLen - steps)
   170  	}
   171  	return off, adj
   172  }
   173  
   174  func getgFromTLS(s *ssagen.State, r int16) {
   175  	// See the comments in cmd/internal/obj/x86/obj6.go
   176  	// near CanUse1InsnTLS for a detailed explanation of these instructions.
   177  	if x86.CanUse1InsnTLS(base.Ctxt) {
   178  		// MOVQ (TLS), r
   179  		p := s.Prog(x86.AMOVQ)
   180  		p.From.Type = obj.TYPE_MEM
   181  		p.From.Reg = x86.REG_TLS
   182  		p.To.Type = obj.TYPE_REG
   183  		p.To.Reg = r
   184  	} else {
   185  		// MOVQ TLS, r
   186  		// MOVQ (r)(TLS*1), r
   187  		p := s.Prog(x86.AMOVQ)
   188  		p.From.Type = obj.TYPE_REG
   189  		p.From.Reg = x86.REG_TLS
   190  		p.To.Type = obj.TYPE_REG
   191  		p.To.Reg = r
   192  		q := s.Prog(x86.AMOVQ)
   193  		q.From.Type = obj.TYPE_MEM
   194  		q.From.Reg = r
   195  		q.From.Index = x86.REG_TLS
   196  		q.From.Scale = 1
   197  		q.To.Type = obj.TYPE_REG
   198  		q.To.Reg = r
   199  	}
   200  }
   201  
   202  func ssaGenValue(s *ssagen.State, v *ssa.Value) {
   203  	switch v.Op {
   204  	case ssa.OpAMD64VFMADD231SD, ssa.OpAMD64VFMADD231SS:
   205  		p := s.Prog(v.Op.Asm())
   206  		p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[2].Reg()}
   207  		p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
   208  		p.AddRestSourceReg(v.Args[1].Reg())
   209  	case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
   210  		r := v.Reg()
   211  		r1 := v.Args[0].Reg()
   212  		r2 := v.Args[1].Reg()
   213  		switch {
   214  		case r == r1:
   215  			p := s.Prog(v.Op.Asm())
   216  			p.From.Type = obj.TYPE_REG
   217  			p.From.Reg = r2
   218  			p.To.Type = obj.TYPE_REG
   219  			p.To.Reg = r
   220  		case r == r2:
   221  			p := s.Prog(v.Op.Asm())
   222  			p.From.Type = obj.TYPE_REG
   223  			p.From.Reg = r1
   224  			p.To.Type = obj.TYPE_REG
   225  			p.To.Reg = r
   226  		default:
   227  			var asm obj.As
   228  			if v.Op == ssa.OpAMD64ADDQ {
   229  				asm = x86.ALEAQ
   230  			} else {
   231  				asm = x86.ALEAL
   232  			}
   233  			p := s.Prog(asm)
   234  			p.From.Type = obj.TYPE_MEM
   235  			p.From.Reg = r1
   236  			p.From.Scale = 1
   237  			p.From.Index = r2
   238  			p.To.Type = obj.TYPE_REG
   239  			p.To.Reg = r
   240  		}
   241  	// 2-address opcode arithmetic
   242  	case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL,
   243  		ssa.OpAMD64MULQ, ssa.OpAMD64MULL,
   244  		ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL,
   245  		ssa.OpAMD64ORQ, ssa.OpAMD64ORL,
   246  		ssa.OpAMD64XORQ, ssa.OpAMD64XORL,
   247  		ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL,
   248  		ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
   249  		ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB,
   250  		ssa.OpAMD64ROLQ, ssa.OpAMD64ROLL, ssa.OpAMD64ROLW, ssa.OpAMD64ROLB,
   251  		ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB,
   252  		ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
   253  		ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
   254  		ssa.OpAMD64MINSS, ssa.OpAMD64MINSD,
   255  		ssa.OpAMD64POR, ssa.OpAMD64PXOR,
   256  		ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ,
   257  		ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ,
   258  		ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ,
   259  		ssa.OpAMD64PCMPEQB, ssa.OpAMD64PSIGNB,
   260  		ssa.OpAMD64PUNPCKLBW:
   261  		opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   262  
   263  	case ssa.OpAMD64PSHUFLW:
   264  		p := s.Prog(v.Op.Asm())
   265  		imm := v.AuxInt
   266  		if imm < 0 || imm > 255 {
   267  			v.Fatalf("Invalid source selection immediate")
   268  		}
   269  		p.From.Offset = imm
   270  		p.From.Type = obj.TYPE_CONST
   271  		p.AddRestSourceReg(v.Args[0].Reg())
   272  		p.To.Type = obj.TYPE_REG
   273  		p.To.Reg = v.Reg()
   274  
   275  	case ssa.OpAMD64PSHUFBbroadcast:
   276  		// PSHUFB with a control mask of zero copies byte 0 to all
   277  		// bytes in the register.
   278  		//
   279  		// X15 is always zero with ABIInternal.
   280  		if s.ABI != obj.ABIInternal {
   281  			// zero X15 manually
   282  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
   283  		}
   284  
   285  		p := s.Prog(v.Op.Asm())
   286  		p.From.Type = obj.TYPE_REG
   287  		p.To.Type = obj.TYPE_REG
   288  		p.To.Reg = v.Reg()
   289  		p.From.Reg = x86.REG_X15
   290  
   291  	case ssa.OpAMD64SHRDQ, ssa.OpAMD64SHLDQ:
   292  		p := s.Prog(v.Op.Asm())
   293  		lo, hi, bits := v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg()
   294  		p.From.Type = obj.TYPE_REG
   295  		p.From.Reg = bits
   296  		p.To.Type = obj.TYPE_REG
   297  		p.To.Reg = lo
   298  		p.AddRestSourceReg(hi)
   299  
   300  	case ssa.OpAMD64BLSIQ, ssa.OpAMD64BLSIL,
   301  		ssa.OpAMD64BLSMSKQ, ssa.OpAMD64BLSMSKL,
   302  		ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
   303  		p := s.Prog(v.Op.Asm())
   304  		p.From.Type = obj.TYPE_REG
   305  		p.From.Reg = v.Args[0].Reg()
   306  		p.To.Type = obj.TYPE_REG
   307  		switch v.Op {
   308  		case ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
   309  			p.To.Reg = v.Reg0()
   310  		default:
   311  			p.To.Reg = v.Reg()
   312  		}
   313  
   314  	case ssa.OpAMD64ANDNQ, ssa.OpAMD64ANDNL:
   315  		p := s.Prog(v.Op.Asm())
   316  		p.From.Type = obj.TYPE_REG
   317  		p.From.Reg = v.Args[0].Reg()
   318  		p.To.Type = obj.TYPE_REG
   319  		p.To.Reg = v.Reg()
   320  		p.AddRestSourceReg(v.Args[1].Reg())
   321  
   322  	case ssa.OpAMD64SARXL, ssa.OpAMD64SARXQ,
   323  		ssa.OpAMD64SHLXL, ssa.OpAMD64SHLXQ,
   324  		ssa.OpAMD64SHRXL, ssa.OpAMD64SHRXQ:
   325  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   326  		p.AddRestSourceReg(v.Args[0].Reg())
   327  
   328  	case ssa.OpAMD64SHLXLload, ssa.OpAMD64SHLXQload,
   329  		ssa.OpAMD64SHRXLload, ssa.OpAMD64SHRXQload,
   330  		ssa.OpAMD64SARXLload, ssa.OpAMD64SARXQload:
   331  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   332  		m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
   333  		ssagen.AddAux(&m, v)
   334  		p.AddRestSource(m)
   335  
   336  	case ssa.OpAMD64SHLXLloadidx1, ssa.OpAMD64SHLXLloadidx4, ssa.OpAMD64SHLXLloadidx8,
   337  		ssa.OpAMD64SHRXLloadidx1, ssa.OpAMD64SHRXLloadidx4, ssa.OpAMD64SHRXLloadidx8,
   338  		ssa.OpAMD64SARXLloadidx1, ssa.OpAMD64SARXLloadidx4, ssa.OpAMD64SARXLloadidx8,
   339  		ssa.OpAMD64SHLXQloadidx1, ssa.OpAMD64SHLXQloadidx8,
   340  		ssa.OpAMD64SHRXQloadidx1, ssa.OpAMD64SHRXQloadidx8,
   341  		ssa.OpAMD64SARXQloadidx1, ssa.OpAMD64SARXQloadidx8:
   342  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[2].Reg())
   343  		m := obj.Addr{Type: obj.TYPE_MEM}
   344  		memIdx(&m, v)
   345  		ssagen.AddAux(&m, v)
   346  		p.AddRestSource(m)
   347  
   348  	case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
   349  		// Arg[0] (the dividend) is in AX.
   350  		// Arg[1] (the divisor) can be in any other register.
   351  		// Result[0] (the quotient) is in AX.
   352  		// Result[1] (the remainder) is in DX.
   353  		r := v.Args[1].Reg()
   354  
   355  		// Zero extend dividend.
   356  		opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX)
   357  
   358  		// Issue divide.
   359  		p := s.Prog(v.Op.Asm())
   360  		p.From.Type = obj.TYPE_REG
   361  		p.From.Reg = r
   362  
   363  	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
   364  		// Arg[0] (the dividend) is in AX.
   365  		// Arg[1] (the divisor) can be in any other register.
   366  		// Result[0] (the quotient) is in AX.
   367  		// Result[1] (the remainder) is in DX.
   368  		r := v.Args[1].Reg()
   369  
   370  		var opCMP, opNEG, opSXD obj.As
   371  		switch v.Op {
   372  		case ssa.OpAMD64DIVQ:
   373  			opCMP, opNEG, opSXD = x86.ACMPQ, x86.ANEGQ, x86.ACQO
   374  		case ssa.OpAMD64DIVL:
   375  			opCMP, opNEG, opSXD = x86.ACMPL, x86.ANEGL, x86.ACDQ
   376  		case ssa.OpAMD64DIVW:
   377  			opCMP, opNEG, opSXD = x86.ACMPW, x86.ANEGW, x86.ACWD
   378  		}
   379  
   380  		// CPU faults upon signed overflow, which occurs when the most
   381  		// negative int is divided by -1. Handle divide by -1 as a special case.
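    		// (For example, math.MinInt64 / -1 does not fit in an int64.)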
   382  		var j1, j2 *obj.Prog
   383  		if ssa.DivisionNeedsFixUp(v) {
   384  			c := s.Prog(opCMP)
   385  			c.From.Type = obj.TYPE_REG
   386  			c.From.Reg = r
   387  			c.To.Type = obj.TYPE_CONST
   388  			c.To.Offset = -1
   389  
   390  			// Divisor is not -1, proceed with normal division.
   391  			j1 = s.Prog(x86.AJNE)
   392  			j1.To.Type = obj.TYPE_BRANCH
   393  
   394  			// Divisor is -1, manually compute quotient and remainder via fixup code.
   395  			// n / -1 = -n
   396  			n1 := s.Prog(opNEG)
   397  			n1.To.Type = obj.TYPE_REG
   398  			n1.To.Reg = x86.REG_AX
   399  
   400  			// n % -1 == 0
   401  			opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX)
   402  
   403  			// TODO(khr): issue only the -1 fixup code we need.
   404  			// For instance, if only the quotient is used, no point in zeroing the remainder.
   405  
   406  			// Skip over normal division.
   407  			j2 = s.Prog(obj.AJMP)
   408  			j2.To.Type = obj.TYPE_BRANCH
   409  		}
   410  
   411  		// Sign extend dividend and perform division.
   412  		p := s.Prog(opSXD)
   413  		if j1 != nil {
   414  			j1.To.SetTarget(p)
   415  		}
   416  		p = s.Prog(v.Op.Asm())
   417  		p.From.Type = obj.TYPE_REG
   418  		p.From.Reg = r
   419  
   420  		if j2 != nil {
   421  			j2.To.SetTarget(s.Pc())
   422  		}
   423  
   424  	case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU:
    425  		// The frontend rewrites constant division by 8/16/32-bit integers into
    426  		// HMUL by a constant.
    427  		// SSA rewrites generate the 64-bit versions.
   428  
   429  		// Arg[0] is already in AX as it's the only register we allow
   430  		// and DX is the only output we care about (the high bits)
   431  		p := s.Prog(v.Op.Asm())
   432  		p.From.Type = obj.TYPE_REG
   433  		p.From.Reg = v.Args[1].Reg()
   434  
   435  		// IMULB puts the high portion in AH instead of DL,
   436  		// so move it to DL for consistency
   437  		if v.Type.Size() == 1 {
   438  			m := s.Prog(x86.AMOVB)
   439  			m.From.Type = obj.TYPE_REG
   440  			m.From.Reg = x86.REG_AH
   441  			m.To.Type = obj.TYPE_REG
   442  			m.To.Reg = x86.REG_DX
   443  		}
   444  
   445  	case ssa.OpAMD64MULQU, ssa.OpAMD64MULLU:
   446  		// Arg[0] is already in AX as it's the only register we allow
   447  		// results lo in AX
   448  		p := s.Prog(v.Op.Asm())
   449  		p.From.Type = obj.TYPE_REG
   450  		p.From.Reg = v.Args[1].Reg()
   451  
   452  	case ssa.OpAMD64MULQU2:
   453  		// Arg[0] is already in AX as it's the only register we allow
   454  		// results hi in DX, lo in AX
   455  		p := s.Prog(v.Op.Asm())
   456  		p.From.Type = obj.TYPE_REG
   457  		p.From.Reg = v.Args[1].Reg()
   458  
   459  	case ssa.OpAMD64DIVQU2:
    460  		// Arg[0], Arg[1] are already in DX, AX, as they're the only registers we allow
   461  		// results q in AX, r in DX
   462  		p := s.Prog(v.Op.Asm())
   463  		p.From.Type = obj.TYPE_REG
   464  		p.From.Reg = v.Args[2].Reg()
   465  
   466  	case ssa.OpAMD64AVGQU:
   467  		// compute (x+y)/2 unsigned.
   468  		// Do a 64-bit add, the overflow goes into the carry.
   469  		// Shift right once and pull the carry back into the 63rd bit.
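    		// For example, with x = y = 1<<63 the 65-bit sum is 1<<64; ADDQ leaves 0
    		// with the carry set, and RCRQ pulls the carry back in to give 1<<63.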
   470  		p := s.Prog(x86.AADDQ)
   471  		p.From.Type = obj.TYPE_REG
   472  		p.To.Type = obj.TYPE_REG
   473  		p.To.Reg = v.Reg()
   474  		p.From.Reg = v.Args[1].Reg()
   475  		p = s.Prog(x86.ARCRQ)
   476  		p.From.Type = obj.TYPE_CONST
   477  		p.From.Offset = 1
   478  		p.To.Type = obj.TYPE_REG
   479  		p.To.Reg = v.Reg()
   480  
   481  	case ssa.OpAMD64ADDQcarry, ssa.OpAMD64ADCQ:
   482  		r := v.Reg0()
   483  		r0 := v.Args[0].Reg()
   484  		r1 := v.Args[1].Reg()
   485  		switch r {
   486  		case r0:
   487  			p := s.Prog(v.Op.Asm())
   488  			p.From.Type = obj.TYPE_REG
   489  			p.From.Reg = r1
   490  			p.To.Type = obj.TYPE_REG
   491  			p.To.Reg = r
   492  		case r1:
   493  			p := s.Prog(v.Op.Asm())
   494  			p.From.Type = obj.TYPE_REG
   495  			p.From.Reg = r0
   496  			p.To.Type = obj.TYPE_REG
   497  			p.To.Reg = r
   498  		default:
   499  			v.Fatalf("output not in same register as an input %s", v.LongString())
   500  		}
   501  
   502  	case ssa.OpAMD64SUBQborrow, ssa.OpAMD64SBBQ:
   503  		p := s.Prog(v.Op.Asm())
   504  		p.From.Type = obj.TYPE_REG
   505  		p.From.Reg = v.Args[1].Reg()
   506  		p.To.Type = obj.TYPE_REG
   507  		p.To.Reg = v.Reg0()
   508  
   509  	case ssa.OpAMD64ADDQconstcarry, ssa.OpAMD64ADCQconst, ssa.OpAMD64SUBQconstborrow, ssa.OpAMD64SBBQconst:
   510  		p := s.Prog(v.Op.Asm())
   511  		p.From.Type = obj.TYPE_CONST
   512  		p.From.Offset = v.AuxInt
   513  		p.To.Type = obj.TYPE_REG
   514  		p.To.Reg = v.Reg0()
   515  
   516  	case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
   517  		r := v.Reg()
   518  		a := v.Args[0].Reg()
   519  		if r == a {
   520  			switch v.AuxInt {
   521  			case 1:
   522  				var asm obj.As
   523  				// Software optimization manual recommends add $1,reg.
    524  				// But inc/dec is 1 byte smaller. ICC always uses inc;
    525  				// Clang/GCC choose depending on flags, but prefer add.
    526  				// Experiments show that inc/dec is both a little faster
    527  				// and makes the binary a little smaller.
   528  				if v.Op == ssa.OpAMD64ADDQconst {
   529  					asm = x86.AINCQ
   530  				} else {
   531  					asm = x86.AINCL
   532  				}
   533  				p := s.Prog(asm)
   534  				p.To.Type = obj.TYPE_REG
   535  				p.To.Reg = r
   536  				return
   537  			case -1:
   538  				var asm obj.As
   539  				if v.Op == ssa.OpAMD64ADDQconst {
   540  					asm = x86.ADECQ
   541  				} else {
   542  					asm = x86.ADECL
   543  				}
   544  				p := s.Prog(asm)
   545  				p.To.Type = obj.TYPE_REG
   546  				p.To.Reg = r
   547  				return
   548  			case 0x80:
   549  				// 'SUBQ $-0x80, r' is shorter to encode than
   550  				// and functionally equivalent to 'ADDQ $0x80, r'.
   551  				asm := x86.ASUBL
   552  				if v.Op == ssa.OpAMD64ADDQconst {
   553  					asm = x86.ASUBQ
   554  				}
   555  				p := s.Prog(asm)
   556  				p.From.Type = obj.TYPE_CONST
   557  				p.From.Offset = -0x80
   558  				p.To.Type = obj.TYPE_REG
   559  				p.To.Reg = r
   560  				return
   561  
   562  			}
   563  			p := s.Prog(v.Op.Asm())
   564  			p.From.Type = obj.TYPE_CONST
   565  			p.From.Offset = v.AuxInt
   566  			p.To.Type = obj.TYPE_REG
   567  			p.To.Reg = r
   568  			return
   569  		}
   570  		var asm obj.As
   571  		if v.Op == ssa.OpAMD64ADDQconst {
   572  			asm = x86.ALEAQ
   573  		} else {
   574  			asm = x86.ALEAL
   575  		}
   576  		p := s.Prog(asm)
   577  		p.From.Type = obj.TYPE_MEM
   578  		p.From.Reg = a
   579  		p.From.Offset = v.AuxInt
   580  		p.To.Type = obj.TYPE_REG
   581  		p.To.Reg = r
   582  
   583  	case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ, ssa.OpAMD64CMOVWEQ,
   584  		ssa.OpAMD64CMOVQLT, ssa.OpAMD64CMOVLLT, ssa.OpAMD64CMOVWLT,
   585  		ssa.OpAMD64CMOVQNE, ssa.OpAMD64CMOVLNE, ssa.OpAMD64CMOVWNE,
   586  		ssa.OpAMD64CMOVQGT, ssa.OpAMD64CMOVLGT, ssa.OpAMD64CMOVWGT,
   587  		ssa.OpAMD64CMOVQLE, ssa.OpAMD64CMOVLLE, ssa.OpAMD64CMOVWLE,
   588  		ssa.OpAMD64CMOVQGE, ssa.OpAMD64CMOVLGE, ssa.OpAMD64CMOVWGE,
   589  		ssa.OpAMD64CMOVQHI, ssa.OpAMD64CMOVLHI, ssa.OpAMD64CMOVWHI,
   590  		ssa.OpAMD64CMOVQLS, ssa.OpAMD64CMOVLLS, ssa.OpAMD64CMOVWLS,
   591  		ssa.OpAMD64CMOVQCC, ssa.OpAMD64CMOVLCC, ssa.OpAMD64CMOVWCC,
   592  		ssa.OpAMD64CMOVQCS, ssa.OpAMD64CMOVLCS, ssa.OpAMD64CMOVWCS,
   593  		ssa.OpAMD64CMOVQGTF, ssa.OpAMD64CMOVLGTF, ssa.OpAMD64CMOVWGTF,
   594  		ssa.OpAMD64CMOVQGEF, ssa.OpAMD64CMOVLGEF, ssa.OpAMD64CMOVWGEF:
   595  		p := s.Prog(v.Op.Asm())
   596  		p.From.Type = obj.TYPE_REG
   597  		p.From.Reg = v.Args[1].Reg()
   598  		p.To.Type = obj.TYPE_REG
   599  		p.To.Reg = v.Reg()
   600  
   601  	case ssa.OpAMD64CMOVQNEF, ssa.OpAMD64CMOVLNEF, ssa.OpAMD64CMOVWNEF:
   602  		// Flag condition: ^ZERO || PARITY
   603  		// Generate:
   604  		//   CMOV*NE  SRC,DST
   605  		//   CMOV*PS  SRC,DST
   606  		p := s.Prog(v.Op.Asm())
   607  		p.From.Type = obj.TYPE_REG
   608  		p.From.Reg = v.Args[1].Reg()
   609  		p.To.Type = obj.TYPE_REG
   610  		p.To.Reg = v.Reg()
   611  		var q *obj.Prog
   612  		if v.Op == ssa.OpAMD64CMOVQNEF {
   613  			q = s.Prog(x86.ACMOVQPS)
   614  		} else if v.Op == ssa.OpAMD64CMOVLNEF {
   615  			q = s.Prog(x86.ACMOVLPS)
   616  		} else {
   617  			q = s.Prog(x86.ACMOVWPS)
   618  		}
   619  		q.From.Type = obj.TYPE_REG
   620  		q.From.Reg = v.Args[1].Reg()
   621  		q.To.Type = obj.TYPE_REG
   622  		q.To.Reg = v.Reg()
   623  
   624  	case ssa.OpAMD64CMOVQEQF, ssa.OpAMD64CMOVLEQF, ssa.OpAMD64CMOVWEQF:
   625  		// Flag condition: ZERO && !PARITY
   626  		// Generate:
   627  		//   MOV      SRC,TMP
   628  		//   CMOV*NE  DST,TMP
   629  		//   CMOV*PC  TMP,DST
   630  		//
   631  		// TODO(rasky): we could generate:
   632  		//   CMOV*NE  DST,SRC
   633  		//   CMOV*PC  SRC,DST
   634  		// But this requires a way for regalloc to know that SRC might be
   635  		// clobbered by this instruction.
   636  		t := v.RegTmp()
   637  		opregreg(s, moveByType(v.Type), t, v.Args[1].Reg())
   638  
   639  		p := s.Prog(v.Op.Asm())
   640  		p.From.Type = obj.TYPE_REG
   641  		p.From.Reg = v.Reg()
   642  		p.To.Type = obj.TYPE_REG
   643  		p.To.Reg = t
   644  		var q *obj.Prog
   645  		if v.Op == ssa.OpAMD64CMOVQEQF {
   646  			q = s.Prog(x86.ACMOVQPC)
   647  		} else if v.Op == ssa.OpAMD64CMOVLEQF {
   648  			q = s.Prog(x86.ACMOVLPC)
   649  		} else {
   650  			q = s.Prog(x86.ACMOVWPC)
   651  		}
   652  		q.From.Type = obj.TYPE_REG
   653  		q.From.Reg = t
   654  		q.To.Type = obj.TYPE_REG
   655  		q.To.Reg = v.Reg()
   656  
   657  	case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
   658  		r := v.Reg()
   659  		p := s.Prog(v.Op.Asm())
   660  		p.From.Type = obj.TYPE_CONST
   661  		p.From.Offset = v.AuxInt
   662  		p.To.Type = obj.TYPE_REG
   663  		p.To.Reg = r
   664  		p.AddRestSourceReg(v.Args[0].Reg())
   665  
   666  	case ssa.OpAMD64ANDQconst:
   667  		asm := v.Op.Asm()
   668  		// If the constant is positive and fits into 32 bits, use ANDL.
   669  		// This saves a few bytes of encoding.
   670  		if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
   671  			asm = x86.AANDL
   672  		}
   673  		p := s.Prog(asm)
   674  		p.From.Type = obj.TYPE_CONST
   675  		p.From.Offset = v.AuxInt
   676  		p.To.Type = obj.TYPE_REG
   677  		p.To.Reg = v.Reg()
   678  
   679  	case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst,
   680  		ssa.OpAMD64ANDLconst,
   681  		ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst,
   682  		ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst,
   683  		ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst,
   684  		ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
   685  		ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
   686  		ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
   687  		p := s.Prog(v.Op.Asm())
   688  		p.From.Type = obj.TYPE_CONST
   689  		p.From.Offset = v.AuxInt
   690  		p.To.Type = obj.TYPE_REG
   691  		p.To.Reg = v.Reg()
   692  	case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask:
   693  		r := v.Reg()
   694  		p := s.Prog(v.Op.Asm())
   695  		p.From.Type = obj.TYPE_REG
   696  		p.From.Reg = r
   697  		p.To.Type = obj.TYPE_REG
   698  		p.To.Reg = r
   699  	case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8,
   700  		ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8,
   701  		ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
   702  		p := s.Prog(v.Op.Asm())
   703  		memIdx(&p.From, v)
   704  		o := v.Reg()
   705  		p.To.Type = obj.TYPE_REG
   706  		p.To.Reg = o
   707  		if v.AuxInt != 0 && v.Aux == nil {
   708  			// Emit an additional LEA to add the displacement instead of creating a slow 3 operand LEA.
   709  			switch v.Op {
   710  			case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8:
   711  				p = s.Prog(x86.ALEAQ)
   712  			case ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8:
   713  				p = s.Prog(x86.ALEAL)
   714  			case ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
   715  				p = s.Prog(x86.ALEAW)
   716  			}
   717  			p.From.Type = obj.TYPE_MEM
   718  			p.From.Reg = o
   719  			p.To.Type = obj.TYPE_REG
   720  			p.To.Reg = o
   721  		}
   722  		ssagen.AddAux(&p.From, v)
   723  	case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL, ssa.OpAMD64LEAW:
   724  		p := s.Prog(v.Op.Asm())
   725  		p.From.Type = obj.TYPE_MEM
   726  		p.From.Reg = v.Args[0].Reg()
   727  		ssagen.AddAux(&p.From, v)
   728  		p.To.Type = obj.TYPE_REG
   729  		p.To.Reg = v.Reg()
   730  	case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
   731  		ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB,
   732  		ssa.OpAMD64BTL, ssa.OpAMD64BTQ:
   733  		opregreg(s, v.Op.Asm(), v.Args[1].Reg(), v.Args[0].Reg())
   734  	case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
    735  		// The Go assembler has swapped operands for UCOMISx relative to CMP;
    736  		// we must account for that right here.
   737  		opregreg(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg())
   738  	case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
   739  		p := s.Prog(v.Op.Asm())
   740  		p.From.Type = obj.TYPE_REG
   741  		p.From.Reg = v.Args[0].Reg()
   742  		p.To.Type = obj.TYPE_CONST
   743  		p.To.Offset = v.AuxInt
   744  	case ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst,
   745  		ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst,
   746  		ssa.OpAMD64BTSQconst,
   747  		ssa.OpAMD64BTCQconst,
   748  		ssa.OpAMD64BTRQconst:
   749  		op := v.Op
   750  		if op == ssa.OpAMD64BTQconst && v.AuxInt < 32 {
   751  			// Emit 32-bit version because it's shorter
   752  			op = ssa.OpAMD64BTLconst
   753  		}
   754  		p := s.Prog(op.Asm())
   755  		p.From.Type = obj.TYPE_CONST
   756  		p.From.Offset = v.AuxInt
   757  		p.To.Type = obj.TYPE_REG
   758  		p.To.Reg = v.Args[0].Reg()
   759  	case ssa.OpAMD64CMPQload, ssa.OpAMD64CMPLload, ssa.OpAMD64CMPWload, ssa.OpAMD64CMPBload:
   760  		p := s.Prog(v.Op.Asm())
   761  		p.From.Type = obj.TYPE_MEM
   762  		p.From.Reg = v.Args[0].Reg()
   763  		ssagen.AddAux(&p.From, v)
   764  		p.To.Type = obj.TYPE_REG
   765  		p.To.Reg = v.Args[1].Reg()
   766  	case ssa.OpAMD64CMPQconstload, ssa.OpAMD64CMPLconstload, ssa.OpAMD64CMPWconstload, ssa.OpAMD64CMPBconstload:
   767  		sc := v.AuxValAndOff()
   768  		p := s.Prog(v.Op.Asm())
   769  		p.From.Type = obj.TYPE_MEM
   770  		p.From.Reg = v.Args[0].Reg()
   771  		ssagen.AddAux2(&p.From, v, sc.Off64())
   772  		p.To.Type = obj.TYPE_CONST
   773  		p.To.Offset = sc.Val64()
   774  	case ssa.OpAMD64CMPQloadidx8, ssa.OpAMD64CMPQloadidx1, ssa.OpAMD64CMPLloadidx4, ssa.OpAMD64CMPLloadidx1, ssa.OpAMD64CMPWloadidx2, ssa.OpAMD64CMPWloadidx1, ssa.OpAMD64CMPBloadidx1:
   775  		p := s.Prog(v.Op.Asm())
   776  		memIdx(&p.From, v)
   777  		ssagen.AddAux(&p.From, v)
   778  		p.To.Type = obj.TYPE_REG
   779  		p.To.Reg = v.Args[2].Reg()
   780  	case ssa.OpAMD64CMPQconstloadidx8, ssa.OpAMD64CMPQconstloadidx1, ssa.OpAMD64CMPLconstloadidx4, ssa.OpAMD64CMPLconstloadidx1, ssa.OpAMD64CMPWconstloadidx2, ssa.OpAMD64CMPWconstloadidx1, ssa.OpAMD64CMPBconstloadidx1:
   781  		sc := v.AuxValAndOff()
   782  		p := s.Prog(v.Op.Asm())
   783  		memIdx(&p.From, v)
   784  		ssagen.AddAux2(&p.From, v, sc.Off64())
   785  		p.To.Type = obj.TYPE_CONST
   786  		p.To.Offset = sc.Val64()
   787  	case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
   788  		x := v.Reg()
   789  
   790  		// If flags aren't live (indicated by v.Aux == nil),
   791  		// then we can rewrite MOV $0, AX into XOR AX, AX.
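    		// (XORL also zeroes the upper 32 bits, so it covers MOVQconst $0 too.)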
   792  		if v.AuxInt == 0 && v.Aux == nil {
   793  			opregreg(s, x86.AXORL, x, x)
   794  			break
   795  		}
   796  
   797  		asm := v.Op.Asm()
   798  		// Use MOVL to move a small constant into a register
   799  		// when the constant is positive and fits into 32 bits.
   800  		if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
    801  			// The upper 32 bits are zeroed automatically when using MOVL.
   802  			asm = x86.AMOVL
   803  		}
   804  		p := s.Prog(asm)
   805  		p.From.Type = obj.TYPE_CONST
   806  		p.From.Offset = v.AuxInt
   807  		p.To.Type = obj.TYPE_REG
   808  		p.To.Reg = x
   809  	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
   810  		x := v.Reg()
   811  		p := s.Prog(v.Op.Asm())
   812  		p.From.Type = obj.TYPE_FCONST
   813  		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
   814  		p.To.Type = obj.TYPE_REG
   815  		p.To.Reg = x
   816  	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
   817  		ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
   818  		ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
   819  		p := s.Prog(v.Op.Asm())
   820  		p.From.Type = obj.TYPE_MEM
   821  		p.From.Reg = v.Args[0].Reg()
   822  		ssagen.AddAux(&p.From, v)
   823  		p.To.Type = obj.TYPE_REG
   824  		p.To.Reg = v.Reg()
   825  	case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1,
   826  		ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8, ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4, ssa.OpAMD64MOVWloadidx2,
   827  		ssa.OpAMD64MOVBELloadidx1, ssa.OpAMD64MOVBELloadidx4, ssa.OpAMD64MOVBELloadidx8, ssa.OpAMD64MOVBEQloadidx1, ssa.OpAMD64MOVBEQloadidx8:
   828  		p := s.Prog(v.Op.Asm())
   829  		memIdx(&p.From, v)
   830  		ssagen.AddAux(&p.From, v)
   831  		p.To.Type = obj.TYPE_REG
   832  		p.To.Reg = v.Reg()
   833  	case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore,
   834  		ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify,
   835  		ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify,
   836  		ssa.OpAMD64MOVBEQstore, ssa.OpAMD64MOVBELstore, ssa.OpAMD64MOVBEWstore:
   837  		p := s.Prog(v.Op.Asm())
   838  		p.From.Type = obj.TYPE_REG
   839  		p.From.Reg = v.Args[1].Reg()
   840  		p.To.Type = obj.TYPE_MEM
   841  		p.To.Reg = v.Args[0].Reg()
   842  		ssagen.AddAux(&p.To, v)
   843  	case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1,
   844  		ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8, ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4, ssa.OpAMD64MOVWstoreidx2,
   845  		ssa.OpAMD64ADDLmodifyidx1, ssa.OpAMD64ADDLmodifyidx4, ssa.OpAMD64ADDLmodifyidx8, ssa.OpAMD64ADDQmodifyidx1, ssa.OpAMD64ADDQmodifyidx8,
   846  		ssa.OpAMD64SUBLmodifyidx1, ssa.OpAMD64SUBLmodifyidx4, ssa.OpAMD64SUBLmodifyidx8, ssa.OpAMD64SUBQmodifyidx1, ssa.OpAMD64SUBQmodifyidx8,
   847  		ssa.OpAMD64ANDLmodifyidx1, ssa.OpAMD64ANDLmodifyidx4, ssa.OpAMD64ANDLmodifyidx8, ssa.OpAMD64ANDQmodifyidx1, ssa.OpAMD64ANDQmodifyidx8,
   848  		ssa.OpAMD64ORLmodifyidx1, ssa.OpAMD64ORLmodifyidx4, ssa.OpAMD64ORLmodifyidx8, ssa.OpAMD64ORQmodifyidx1, ssa.OpAMD64ORQmodifyidx8,
   849  		ssa.OpAMD64XORLmodifyidx1, ssa.OpAMD64XORLmodifyidx4, ssa.OpAMD64XORLmodifyidx8, ssa.OpAMD64XORQmodifyidx1, ssa.OpAMD64XORQmodifyidx8,
   850  		ssa.OpAMD64MOVBEWstoreidx1, ssa.OpAMD64MOVBEWstoreidx2, ssa.OpAMD64MOVBELstoreidx1, ssa.OpAMD64MOVBELstoreidx4, ssa.OpAMD64MOVBELstoreidx8, ssa.OpAMD64MOVBEQstoreidx1, ssa.OpAMD64MOVBEQstoreidx8:
   851  		p := s.Prog(v.Op.Asm())
   852  		p.From.Type = obj.TYPE_REG
   853  		p.From.Reg = v.Args[2].Reg()
   854  		memIdx(&p.To, v)
   855  		ssagen.AddAux(&p.To, v)
   856  	case ssa.OpAMD64ADDQconstmodify, ssa.OpAMD64ADDLconstmodify:
   857  		sc := v.AuxValAndOff()
   858  		off := sc.Off64()
   859  		val := sc.Val()
   860  		if val == 1 || val == -1 {
   861  			var asm obj.As
   862  			if v.Op == ssa.OpAMD64ADDQconstmodify {
   863  				if val == 1 {
   864  					asm = x86.AINCQ
   865  				} else {
   866  					asm = x86.ADECQ
   867  				}
   868  			} else {
   869  				if val == 1 {
   870  					asm = x86.AINCL
   871  				} else {
   872  					asm = x86.ADECL
   873  				}
   874  			}
   875  			p := s.Prog(asm)
   876  			p.To.Type = obj.TYPE_MEM
   877  			p.To.Reg = v.Args[0].Reg()
   878  			ssagen.AddAux2(&p.To, v, off)
   879  			break
   880  		}
   881  		fallthrough
   882  	case ssa.OpAMD64ANDQconstmodify, ssa.OpAMD64ANDLconstmodify, ssa.OpAMD64ORQconstmodify, ssa.OpAMD64ORLconstmodify,
   883  		ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify,
   884  		ssa.OpAMD64BTSQconstmodify, ssa.OpAMD64BTRQconstmodify, ssa.OpAMD64BTCQconstmodify:
   885  		sc := v.AuxValAndOff()
   886  		off := sc.Off64()
   887  		val := sc.Val64()
   888  		p := s.Prog(v.Op.Asm())
   889  		p.From.Type = obj.TYPE_CONST
   890  		p.From.Offset = val
   891  		p.To.Type = obj.TYPE_MEM
   892  		p.To.Reg = v.Args[0].Reg()
   893  		ssagen.AddAux2(&p.To, v, off)
   894  
   895  	case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
   896  		p := s.Prog(v.Op.Asm())
   897  		p.From.Type = obj.TYPE_CONST
   898  		sc := v.AuxValAndOff()
   899  		p.From.Offset = sc.Val64()
   900  		p.To.Type = obj.TYPE_MEM
   901  		p.To.Reg = v.Args[0].Reg()
   902  		ssagen.AddAux2(&p.To, v, sc.Off64())
   903  	case ssa.OpAMD64MOVOstoreconst:
   904  		sc := v.AuxValAndOff()
   905  		if sc.Val() != 0 {
   906  			v.Fatalf("MOVO for non zero constants not implemented: %s", v.LongString())
   907  		}
   908  
   909  		if s.ABI != obj.ABIInternal {
   910  			// zero X15 manually
   911  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
   912  		}
   913  		p := s.Prog(v.Op.Asm())
   914  		p.From.Type = obj.TYPE_REG
   915  		p.From.Reg = x86.REG_X15
   916  		p.To.Type = obj.TYPE_MEM
   917  		p.To.Reg = v.Args[0].Reg()
   918  		ssagen.AddAux2(&p.To, v, sc.Off64())
   919  
   920  	case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1,
   921  		ssa.OpAMD64ADDLconstmodifyidx1, ssa.OpAMD64ADDLconstmodifyidx4, ssa.OpAMD64ADDLconstmodifyidx8, ssa.OpAMD64ADDQconstmodifyidx1, ssa.OpAMD64ADDQconstmodifyidx8,
   922  		ssa.OpAMD64ANDLconstmodifyidx1, ssa.OpAMD64ANDLconstmodifyidx4, ssa.OpAMD64ANDLconstmodifyidx8, ssa.OpAMD64ANDQconstmodifyidx1, ssa.OpAMD64ANDQconstmodifyidx8,
   923  		ssa.OpAMD64ORLconstmodifyidx1, ssa.OpAMD64ORLconstmodifyidx4, ssa.OpAMD64ORLconstmodifyidx8, ssa.OpAMD64ORQconstmodifyidx1, ssa.OpAMD64ORQconstmodifyidx8,
   924  		ssa.OpAMD64XORLconstmodifyidx1, ssa.OpAMD64XORLconstmodifyidx4, ssa.OpAMD64XORLconstmodifyidx8, ssa.OpAMD64XORQconstmodifyidx1, ssa.OpAMD64XORQconstmodifyidx8:
   925  		p := s.Prog(v.Op.Asm())
   926  		p.From.Type = obj.TYPE_CONST
   927  		sc := v.AuxValAndOff()
   928  		p.From.Offset = sc.Val64()
   929  		switch {
   930  		case p.As == x86.AADDQ && p.From.Offset == 1:
   931  			p.As = x86.AINCQ
   932  			p.From.Type = obj.TYPE_NONE
   933  		case p.As == x86.AADDQ && p.From.Offset == -1:
   934  			p.As = x86.ADECQ
   935  			p.From.Type = obj.TYPE_NONE
   936  		case p.As == x86.AADDL && p.From.Offset == 1:
   937  			p.As = x86.AINCL
   938  			p.From.Type = obj.TYPE_NONE
   939  		case p.As == x86.AADDL && p.From.Offset == -1:
   940  			p.As = x86.ADECL
   941  			p.From.Type = obj.TYPE_NONE
   942  		}
   943  		memIdx(&p.To, v)
   944  		ssagen.AddAux2(&p.To, v, sc.Off64())
   945  	case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
   946  		ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
   947  		ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS, ssa.OpAMD64VPBROADCASTB, ssa.OpAMD64PMOVMSKB:
   948  		opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg())
   949  	case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
   950  		r := v.Reg()
   951  		// Break false dependency on destination register.
   952  		opregreg(s, x86.AXORPS, r, r)
   953  		opregreg(s, v.Op.Asm(), r, v.Args[0].Reg())
   954  	case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i, ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
   955  		var p *obj.Prog
   956  		switch v.Op {
   957  		case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i:
   958  			p = s.Prog(x86.AMOVQ)
   959  		case ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
   960  			p = s.Prog(x86.AMOVL)
   961  		}
   962  		p.From.Type = obj.TYPE_REG
   963  		p.From.Reg = v.Args[0].Reg()
   964  		p.To.Type = obj.TYPE_REG
   965  		p.To.Reg = v.Reg()
   966  	case ssa.OpAMD64ADDQload, ssa.OpAMD64ADDLload, ssa.OpAMD64SUBQload, ssa.OpAMD64SUBLload,
   967  		ssa.OpAMD64ANDQload, ssa.OpAMD64ANDLload, ssa.OpAMD64ORQload, ssa.OpAMD64ORLload,
   968  		ssa.OpAMD64XORQload, ssa.OpAMD64XORLload, ssa.OpAMD64ADDSDload, ssa.OpAMD64ADDSSload,
   969  		ssa.OpAMD64SUBSDload, ssa.OpAMD64SUBSSload, ssa.OpAMD64MULSDload, ssa.OpAMD64MULSSload,
   970  		ssa.OpAMD64DIVSDload, ssa.OpAMD64DIVSSload:
   971  		p := s.Prog(v.Op.Asm())
   972  		p.From.Type = obj.TYPE_MEM
   973  		p.From.Reg = v.Args[1].Reg()
   974  		ssagen.AddAux(&p.From, v)
   975  		p.To.Type = obj.TYPE_REG
   976  		p.To.Reg = v.Reg()
   977  	case ssa.OpAMD64ADDLloadidx1, ssa.OpAMD64ADDLloadidx4, ssa.OpAMD64ADDLloadidx8, ssa.OpAMD64ADDQloadidx1, ssa.OpAMD64ADDQloadidx8,
   978  		ssa.OpAMD64SUBLloadidx1, ssa.OpAMD64SUBLloadidx4, ssa.OpAMD64SUBLloadidx8, ssa.OpAMD64SUBQloadidx1, ssa.OpAMD64SUBQloadidx8,
   979  		ssa.OpAMD64ANDLloadidx1, ssa.OpAMD64ANDLloadidx4, ssa.OpAMD64ANDLloadidx8, ssa.OpAMD64ANDQloadidx1, ssa.OpAMD64ANDQloadidx8,
   980  		ssa.OpAMD64ORLloadidx1, ssa.OpAMD64ORLloadidx4, ssa.OpAMD64ORLloadidx8, ssa.OpAMD64ORQloadidx1, ssa.OpAMD64ORQloadidx8,
   981  		ssa.OpAMD64XORLloadidx1, ssa.OpAMD64XORLloadidx4, ssa.OpAMD64XORLloadidx8, ssa.OpAMD64XORQloadidx1, ssa.OpAMD64XORQloadidx8,
   982  		ssa.OpAMD64ADDSSloadidx1, ssa.OpAMD64ADDSSloadidx4, ssa.OpAMD64ADDSDloadidx1, ssa.OpAMD64ADDSDloadidx8,
   983  		ssa.OpAMD64SUBSSloadidx1, ssa.OpAMD64SUBSSloadidx4, ssa.OpAMD64SUBSDloadidx1, ssa.OpAMD64SUBSDloadidx8,
   984  		ssa.OpAMD64MULSSloadidx1, ssa.OpAMD64MULSSloadidx4, ssa.OpAMD64MULSDloadidx1, ssa.OpAMD64MULSDloadidx8,
   985  		ssa.OpAMD64DIVSSloadidx1, ssa.OpAMD64DIVSSloadidx4, ssa.OpAMD64DIVSDloadidx1, ssa.OpAMD64DIVSDloadidx8:
   986  		p := s.Prog(v.Op.Asm())
   987  
   988  		r, i := v.Args[1].Reg(), v.Args[2].Reg()
   989  		p.From.Type = obj.TYPE_MEM
   990  		p.From.Scale = v.Op.Scale()
   991  		if p.From.Scale == 1 && i == x86.REG_SP {
   992  			r, i = i, r
   993  		}
   994  		p.From.Reg = r
   995  		p.From.Index = i
   996  
   997  		ssagen.AddAux(&p.From, v)
   998  		p.To.Type = obj.TYPE_REG
   999  		p.To.Reg = v.Reg()
  1000  	case ssa.OpAMD64DUFFZERO:
  1001  		if s.ABI != obj.ABIInternal {
  1002  			// zero X15 manually
  1003  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
  1004  		}
  1005  		off := duffStart(v.AuxInt)
  1006  		adj := duffAdj(v.AuxInt)
  1007  		var p *obj.Prog
  1008  		if adj != 0 {
  1009  			p = s.Prog(x86.ALEAQ)
  1010  			p.From.Type = obj.TYPE_MEM
  1011  			p.From.Offset = adj
  1012  			p.From.Reg = x86.REG_DI
  1013  			p.To.Type = obj.TYPE_REG
  1014  			p.To.Reg = x86.REG_DI
  1015  		}
  1016  		p = s.Prog(obj.ADUFFZERO)
  1017  		p.To.Type = obj.TYPE_ADDR
  1018  		p.To.Sym = ir.Syms.Duffzero
  1019  		p.To.Offset = off
  1020  	case ssa.OpAMD64DUFFCOPY:
  1021  		p := s.Prog(obj.ADUFFCOPY)
  1022  		p.To.Type = obj.TYPE_ADDR
  1023  		p.To.Sym = ir.Syms.Duffcopy
  1024  		if v.AuxInt%16 != 0 {
  1025  			v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt)
  1026  		}
  1027  		p.To.Offset = 14 * (64 - v.AuxInt/16)
  1028  		// 14 and 64 are magic constants.  14 is the number of bytes to encode:
  1029  		//	MOVUPS	(SI), X0
  1030  		//	ADDQ	$16, SI
  1031  		//	MOVUPS	X0, (DI)
  1032  		//	ADDQ	$16, DI
  1033  		// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
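   		// For example, a 64-byte copy (4 blocks) gets offset 14*(64-4) = 840,
   		// which starts execution exactly 4 blocks before the end of duffcopy.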
  1034  
  1035  	case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
  1036  		if v.Type.IsMemory() {
  1037  			return
  1038  		}
  1039  		x := v.Args[0].Reg()
  1040  		y := v.Reg()
  1041  		if x != y {
  1042  			opregreg(s, moveByType(v.Type), y, x)
  1043  		}
  1044  	case ssa.OpLoadReg:
  1045  		if v.Type.IsFlags() {
  1046  			v.Fatalf("load flags not implemented: %v", v.LongString())
  1047  			return
  1048  		}
  1049  		p := s.Prog(loadByType(v.Type))
  1050  		ssagen.AddrAuto(&p.From, v.Args[0])
  1051  		p.To.Type = obj.TYPE_REG
  1052  		p.To.Reg = v.Reg()
  1053  
  1054  	case ssa.OpStoreReg:
  1055  		if v.Type.IsFlags() {
  1056  			v.Fatalf("store flags not implemented: %v", v.LongString())
  1057  			return
  1058  		}
  1059  		p := s.Prog(storeByType(v.Type))
  1060  		p.From.Type = obj.TYPE_REG
  1061  		p.From.Reg = v.Args[0].Reg()
  1062  		ssagen.AddrAuto(&p.To, v)
  1063  	case ssa.OpAMD64LoweredHasCPUFeature:
  1064  		p := s.Prog(x86.AMOVBLZX)
  1065  		p.From.Type = obj.TYPE_MEM
  1066  		ssagen.AddAux(&p.From, v)
  1067  		p.To.Type = obj.TYPE_REG
  1068  		p.To.Reg = v.Reg()
  1069  	case ssa.OpArgIntReg, ssa.OpArgFloatReg:
   1070  		// The assembler needs to wrap the entry safepoint/stack growth code with spill/unspill.
  1071  		// The loop only runs once.
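   		// (RegArgs is cleared below after the first OpArg* value, so later
   		// values loop over an empty slice.)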
  1072  		for _, ap := range v.Block.Func.RegArgs {
  1073  			// Pass the spill/unspill information along to the assembler, offset by size of return PC pushed on stack.
  1074  			addr := ssagen.SpillSlotAddr(ap, x86.REG_SP, v.Block.Func.Config.PtrSize)
  1075  			s.FuncInfo().AddSpill(
  1076  				obj.RegSpill{Reg: ap.Reg, Addr: addr, Unspill: loadByType(ap.Type), Spill: storeByType(ap.Type)})
  1077  		}
  1078  		v.Block.Func.RegArgs = nil
  1079  		ssagen.CheckArgReg(v)
  1080  	case ssa.OpAMD64LoweredGetClosurePtr:
  1081  		// Closure pointer is DX.
  1082  		ssagen.CheckLoweredGetClosurePtr(v)
  1083  	case ssa.OpAMD64LoweredGetG:
  1084  		if s.ABI == obj.ABIInternal {
  1085  			v.Fatalf("LoweredGetG should not appear in ABIInternal")
  1086  		}
  1087  		r := v.Reg()
  1088  		getgFromTLS(s, r)
  1089  	case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail:
  1090  		if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal {
  1091  			// zeroing X15 when entering ABIInternal from ABI0
  1092  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
  1093  			// set G register from TLS
  1094  			getgFromTLS(s, x86.REG_R14)
  1095  		}
  1096  		if v.Op == ssa.OpAMD64CALLtail {
  1097  			s.TailCall(v)
  1098  			break
  1099  		}
  1100  		s.Call(v)
  1101  		if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 {
  1102  			// zeroing X15 when entering ABIInternal from ABI0
  1103  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
  1104  			// set G register from TLS
  1105  			getgFromTLS(s, x86.REG_R14)
  1106  		}
  1107  	case ssa.OpAMD64CALLclosure, ssa.OpAMD64CALLinter:
  1108  		s.Call(v)
  1109  
  1110  	case ssa.OpAMD64LoweredGetCallerPC:
  1111  		p := s.Prog(x86.AMOVQ)
  1112  		p.From.Type = obj.TYPE_MEM
  1113  		p.From.Offset = -8 // PC is stored 8 bytes below first parameter.
  1114  		p.From.Name = obj.NAME_PARAM
  1115  		p.To.Type = obj.TYPE_REG
  1116  		p.To.Reg = v.Reg()
  1117  
  1118  	case ssa.OpAMD64LoweredGetCallerSP:
  1119  		// caller's SP is the address of the first arg
  1120  		mov := x86.AMOVQ
  1121  		if types.PtrSize == 4 {
  1122  			mov = x86.AMOVL
  1123  		}
  1124  		p := s.Prog(mov)
  1125  		p.From.Type = obj.TYPE_ADDR
  1126  		p.From.Offset = -base.Ctxt.Arch.FixedFrameSize // 0 on amd64, just to be consistent with other architectures
  1127  		p.From.Name = obj.NAME_PARAM
  1128  		p.To.Type = obj.TYPE_REG
  1129  		p.To.Reg = v.Reg()
  1130  
  1131  	case ssa.OpAMD64LoweredWB:
  1132  		p := s.Prog(obj.ACALL)
  1133  		p.To.Type = obj.TYPE_MEM
  1134  		p.To.Name = obj.NAME_EXTERN
  1135  		// AuxInt encodes how many buffer entries we need.
  1136  		p.To.Sym = ir.Syms.GCWriteBarrier[v.AuxInt-1]
  1137  
  1138  	case ssa.OpAMD64LoweredPanicBoundsA, ssa.OpAMD64LoweredPanicBoundsB, ssa.OpAMD64LoweredPanicBoundsC:
  1139  		p := s.Prog(obj.ACALL)
  1140  		p.To.Type = obj.TYPE_MEM
  1141  		p.To.Name = obj.NAME_EXTERN
  1142  		p.To.Sym = ssagen.BoundsCheckFunc[v.AuxInt]
  1143  		s.UseArgs(int64(2 * types.PtrSize)) // space used in callee args area by assembly stubs
  1144  
  1145  	case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
  1146  		ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
  1147  		ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
  1148  		p := s.Prog(v.Op.Asm())
  1149  		p.To.Type = obj.TYPE_REG
  1150  		p.To.Reg = v.Reg()
  1151  
  1152  	case ssa.OpAMD64NEGLflags:
  1153  		p := s.Prog(v.Op.Asm())
  1154  		p.To.Type = obj.TYPE_REG
  1155  		p.To.Reg = v.Reg0()
  1156  
  1157  	case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS:
  1158  		p := s.Prog(v.Op.Asm())
  1159  		p.From.Type = obj.TYPE_REG
  1160  		p.From.Reg = v.Args[0].Reg()
  1161  		p.To.Type = obj.TYPE_REG
  1162  		switch v.Op {
  1163  		case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ:
  1164  			p.To.Reg = v.Reg0()
  1165  		case ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS:
  1166  			p.To.Reg = v.Reg()
  1167  		}
  1168  	case ssa.OpAMD64LoweredRound32F, ssa.OpAMD64LoweredRound64F:
  1169  		// input is already rounded
  1170  	case ssa.OpAMD64ROUNDSD:
  1171  		p := s.Prog(v.Op.Asm())
  1172  		val := v.AuxInt
  1173  		// 0 means math.RoundToEven, 1 Floor, 2 Ceil, 3 Trunc
  1174  		if val < 0 || val > 3 {
  1175  			v.Fatalf("Invalid rounding mode")
  1176  		}
  1177  		p.From.Offset = val
  1178  		p.From.Type = obj.TYPE_CONST
  1179  		p.AddRestSourceReg(v.Args[0].Reg())
  1180  		p.To.Type = obj.TYPE_REG
  1181  		p.To.Reg = v.Reg()
  1182  	case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL,
  1183  		ssa.OpAMD64TZCNTQ, ssa.OpAMD64TZCNTL,
  1184  		ssa.OpAMD64LZCNTQ, ssa.OpAMD64LZCNTL:
  1185  		if v.Args[0].Reg() != v.Reg() {
   1186  			// POPCNT/TZCNT/LZCNT have a false dependency on the destination register on Intel CPUs.
  1187  			// TZCNT/LZCNT problem affects pre-Skylake models. See discussion at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62011#c7.
  1188  			// Xor register with itself to break the dependency.
  1189  			opregreg(s, x86.AXORL, v.Reg(), v.Reg())
  1190  		}
  1191  		p := s.Prog(v.Op.Asm())
  1192  		p.From.Type = obj.TYPE_REG
  1193  		p.From.Reg = v.Args[0].Reg()
  1194  		p.To.Type = obj.TYPE_REG
  1195  		p.To.Reg = v.Reg()
  1196  
  1197  	case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
  1198  		ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
  1199  		ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
  1200  		ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF,
  1201  		ssa.OpAMD64SETB, ssa.OpAMD64SETBE,
  1202  		ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN,
  1203  		ssa.OpAMD64SETA, ssa.OpAMD64SETAE,
  1204  		ssa.OpAMD64SETO:
  1205  		p := s.Prog(v.Op.Asm())
  1206  		p.To.Type = obj.TYPE_REG
  1207  		p.To.Reg = v.Reg()
  1208  
  1209  	case ssa.OpAMD64SETEQstore, ssa.OpAMD64SETNEstore,
  1210  		ssa.OpAMD64SETLstore, ssa.OpAMD64SETLEstore,
  1211  		ssa.OpAMD64SETGstore, ssa.OpAMD64SETGEstore,
  1212  		ssa.OpAMD64SETBstore, ssa.OpAMD64SETBEstore,
  1213  		ssa.OpAMD64SETAstore, ssa.OpAMD64SETAEstore:
  1214  		p := s.Prog(v.Op.Asm())
  1215  		p.To.Type = obj.TYPE_MEM
  1216  		p.To.Reg = v.Args[0].Reg()
  1217  		ssagen.AddAux(&p.To, v)
  1218  
  1219  	case ssa.OpAMD64SETEQstoreidx1, ssa.OpAMD64SETNEstoreidx1,
  1220  		ssa.OpAMD64SETLstoreidx1, ssa.OpAMD64SETLEstoreidx1,
  1221  		ssa.OpAMD64SETGstoreidx1, ssa.OpAMD64SETGEstoreidx1,
  1222  		ssa.OpAMD64SETBstoreidx1, ssa.OpAMD64SETBEstoreidx1,
  1223  		ssa.OpAMD64SETAstoreidx1, ssa.OpAMD64SETAEstoreidx1:
  1224  		p := s.Prog(v.Op.Asm())
  1225  		memIdx(&p.To, v)
  1226  		ssagen.AddAux(&p.To, v)
  1227  
  1228  	case ssa.OpAMD64SETNEF:
  1229  		t := v.RegTmp()
  1230  		p := s.Prog(v.Op.Asm())
  1231  		p.To.Type = obj.TYPE_REG
  1232  		p.To.Reg = v.Reg()
  1233  		q := s.Prog(x86.ASETPS)
  1234  		q.To.Type = obj.TYPE_REG
  1235  		q.To.Reg = t
   1236  		// ORL avoids a partial register write and is smaller than ORQ, which the old compiler used.
  1237  		opregreg(s, x86.AORL, v.Reg(), t)
  1238  
  1239  	case ssa.OpAMD64SETEQF:
  1240  		t := v.RegTmp()
  1241  		p := s.Prog(v.Op.Asm())
  1242  		p.To.Type = obj.TYPE_REG
  1243  		p.To.Reg = v.Reg()
  1244  		q := s.Prog(x86.ASETPC)
  1245  		q.To.Type = obj.TYPE_REG
  1246  		q.To.Reg = t
   1247  		// ANDL avoids a partial register write and is smaller than ANDQ, which the old compiler used.
  1248  		opregreg(s, x86.AANDL, v.Reg(), t)
  1249  
  1250  	case ssa.OpAMD64InvertFlags:
  1251  		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
  1252  	case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT:
  1253  		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
  1254  	case ssa.OpAMD64AddTupleFirst32, ssa.OpAMD64AddTupleFirst64:
  1255  		v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString())
  1256  	case ssa.OpAMD64REPSTOSQ:
  1257  		s.Prog(x86.AREP)
  1258  		s.Prog(x86.ASTOSQ)
  1259  	case ssa.OpAMD64REPMOVSQ:
  1260  		s.Prog(x86.AREP)
  1261  		s.Prog(x86.AMOVSQ)
  1262  	case ssa.OpAMD64LoweredNilCheck:
  1263  		// Issue a load which will fault if the input is nil.
  1264  		// TODO: We currently use the 2-byte instruction TESTB AX, (reg).
  1265  		// Should we use the 3-byte TESTB $0, (reg) instead? It is larger
  1266  		// but it doesn't have false dependency on AX.
  1267  		// Or maybe allocate an output register and use MOVL (reg),reg2 ?
  1268  		// That trades clobbering flags for clobbering a register.
  1269  		p := s.Prog(x86.ATESTB)
  1270  		p.From.Type = obj.TYPE_REG
  1271  		p.From.Reg = x86.REG_AX
  1272  		p.To.Type = obj.TYPE_MEM
  1273  		p.To.Reg = v.Args[0].Reg()
  1274  		if logopt.Enabled() {
  1275  			logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name)
  1276  		}
  1277  		if base.Debug.Nil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers
  1278  			base.WarnfAt(v.Pos, "generated nil check")
  1279  		}
  1280  	case ssa.OpAMD64MOVBatomicload, ssa.OpAMD64MOVLatomicload, ssa.OpAMD64MOVQatomicload:
  1281  		p := s.Prog(v.Op.Asm())
  1282  		p.From.Type = obj.TYPE_MEM
  1283  		p.From.Reg = v.Args[0].Reg()
  1284  		ssagen.AddAux(&p.From, v)
  1285  		p.To.Type = obj.TYPE_REG
  1286  		p.To.Reg = v.Reg0()
  1287  	case ssa.OpAMD64XCHGB, ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ:
  1288  		p := s.Prog(v.Op.Asm())
  1289  		p.From.Type = obj.TYPE_REG
  1290  		p.From.Reg = v.Reg0()
  1291  		p.To.Type = obj.TYPE_MEM
  1292  		p.To.Reg = v.Args[1].Reg()
  1293  		ssagen.AddAux(&p.To, v)
  1294  	case ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock:
  1295  		s.Prog(x86.ALOCK)
  1296  		p := s.Prog(v.Op.Asm())
  1297  		p.From.Type = obj.TYPE_REG
  1298  		p.From.Reg = v.Reg0()
  1299  		p.To.Type = obj.TYPE_MEM
  1300  		p.To.Reg = v.Args[1].Reg()
  1301  		ssagen.AddAux(&p.To, v)
  1302  	case ssa.OpAMD64CMPXCHGLlock, ssa.OpAMD64CMPXCHGQlock:
  1303  		if v.Args[1].Reg() != x86.REG_AX {
  1304  			v.Fatalf("input[1] not in AX %s", v.LongString())
  1305  		}
  1306  		s.Prog(x86.ALOCK)
  1307  		p := s.Prog(v.Op.Asm())
  1308  		p.From.Type = obj.TYPE_REG
  1309  		p.From.Reg = v.Args[2].Reg()
  1310  		p.To.Type = obj.TYPE_MEM
  1311  		p.To.Reg = v.Args[0].Reg()
  1312  		ssagen.AddAux(&p.To, v)
  1313  		p = s.Prog(x86.ASETEQ)
  1314  		p.To.Type = obj.TYPE_REG
  1315  		p.To.Reg = v.Reg0()
  1316  	case ssa.OpAMD64ANDBlock, ssa.OpAMD64ANDLlock, ssa.OpAMD64ANDQlock, ssa.OpAMD64ORBlock, ssa.OpAMD64ORLlock, ssa.OpAMD64ORQlock:
  1317  		// Atomic memory operations that don't need to return the old value.
  1318  		s.Prog(x86.ALOCK)
  1319  		p := s.Prog(v.Op.Asm())
  1320  		p.From.Type = obj.TYPE_REG
  1321  		p.From.Reg = v.Args[1].Reg()
  1322  		p.To.Type = obj.TYPE_MEM
  1323  		p.To.Reg = v.Args[0].Reg()
  1324  		ssagen.AddAux(&p.To, v)
  1325  	case ssa.OpAMD64LoweredAtomicAnd64, ssa.OpAMD64LoweredAtomicOr64, ssa.OpAMD64LoweredAtomicAnd32, ssa.OpAMD64LoweredAtomicOr32:
  1326  		// Atomic memory operations that need to return the old value.
  1327  		// We need to do these with compare-and-exchange to get access to the old value.
  1328  		// loop:
  1329  		// MOVQ mask, tmp
  1330  		// MOVQ (addr), AX
  1331  		// ANDQ AX, tmp
   1332  		// LOCK CMPXCHGQ tmp, (addr) : note that AX is the implicit old value to compare against
  1333  		// JNE loop
  1334  		// : result in AX
  1335  		mov := x86.AMOVQ
  1336  		op := x86.AANDQ
  1337  		cmpxchg := x86.ACMPXCHGQ
  1338  		switch v.Op {
  1339  		case ssa.OpAMD64LoweredAtomicOr64:
  1340  			op = x86.AORQ
  1341  		case ssa.OpAMD64LoweredAtomicAnd32:
  1342  			mov = x86.AMOVL
  1343  			op = x86.AANDL
  1344  			cmpxchg = x86.ACMPXCHGL
  1345  		case ssa.OpAMD64LoweredAtomicOr32:
  1346  			mov = x86.AMOVL
  1347  			op = x86.AORL
  1348  			cmpxchg = x86.ACMPXCHGL
  1349  		}
  1350  		addr := v.Args[0].Reg()
  1351  		mask := v.Args[1].Reg()
  1352  		tmp := v.RegTmp()
  1353  		p1 := s.Prog(mov)
  1354  		p1.From.Type = obj.TYPE_REG
  1355  		p1.From.Reg = mask
  1356  		p1.To.Type = obj.TYPE_REG
  1357  		p1.To.Reg = tmp
  1358  		p2 := s.Prog(mov)
  1359  		p2.From.Type = obj.TYPE_MEM
  1360  		p2.From.Reg = addr
  1361  		ssagen.AddAux(&p2.From, v)
  1362  		p2.To.Type = obj.TYPE_REG
  1363  		p2.To.Reg = x86.REG_AX
  1364  		p3 := s.Prog(op)
  1365  		p3.From.Type = obj.TYPE_REG
  1366  		p3.From.Reg = x86.REG_AX
  1367  		p3.To.Type = obj.TYPE_REG
  1368  		p3.To.Reg = tmp
  1369  		s.Prog(x86.ALOCK)
  1370  		p5 := s.Prog(cmpxchg)
  1371  		p5.From.Type = obj.TYPE_REG
  1372  		p5.From.Reg = tmp
  1373  		p5.To.Type = obj.TYPE_MEM
  1374  		p5.To.Reg = addr
  1375  		ssagen.AddAux(&p5.To, v)
  1376  		p6 := s.Prog(x86.AJNE)
  1377  		p6.To.Type = obj.TYPE_BRANCH
  1378  		p6.To.SetTarget(p1)
  1379  	case ssa.OpAMD64PrefetchT0, ssa.OpAMD64PrefetchNTA:
  1380  		p := s.Prog(v.Op.Asm())
  1381  		p.From.Type = obj.TYPE_MEM
  1382  		p.From.Reg = v.Args[0].Reg()
  1383  	case ssa.OpClobber:
  1384  		p := s.Prog(x86.AMOVL)
  1385  		p.From.Type = obj.TYPE_CONST
  1386  		p.From.Offset = 0xdeaddead
  1387  		p.To.Type = obj.TYPE_MEM
  1388  		p.To.Reg = x86.REG_SP
  1389  		ssagen.AddAux(&p.To, v)
  1390  		p = s.Prog(x86.AMOVL)
  1391  		p.From.Type = obj.TYPE_CONST
  1392  		p.From.Offset = 0xdeaddead
  1393  		p.To.Type = obj.TYPE_MEM
  1394  		p.To.Reg = x86.REG_SP
  1395  		ssagen.AddAux(&p.To, v)
  1396  		p.To.Offset += 4
  1397  	case ssa.OpClobberReg:
  1398  		x := uint64(0xdeaddeaddeaddead)
  1399  		p := s.Prog(x86.AMOVQ)
  1400  		p.From.Type = obj.TYPE_CONST
  1401  		p.From.Offset = int64(x)
  1402  		p.To.Type = obj.TYPE_REG
  1403  		p.To.Reg = v.Reg()
  1404  	default:
  1405  		v.Fatalf("genValue not implemented: %s", v.LongString())
  1406  	}
  1407  }
  1408  
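         // blockJump maps each conditional block kind to its branch instruction (asm)
         // and the inverted branch (invasm) used when execution falls through to
         // Succs[0] and must jump to Succs[1] instead.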
  1409  var blockJump = [...]struct {
  1410  	asm, invasm obj.As
  1411  }{
  1412  	ssa.BlockAMD64EQ:  {x86.AJEQ, x86.AJNE},
  1413  	ssa.BlockAMD64NE:  {x86.AJNE, x86.AJEQ},
  1414  	ssa.BlockAMD64LT:  {x86.AJLT, x86.AJGE},
  1415  	ssa.BlockAMD64GE:  {x86.AJGE, x86.AJLT},
  1416  	ssa.BlockAMD64LE:  {x86.AJLE, x86.AJGT},
  1417  	ssa.BlockAMD64GT:  {x86.AJGT, x86.AJLE},
  1418  	ssa.BlockAMD64OS:  {x86.AJOS, x86.AJOC},
  1419  	ssa.BlockAMD64OC:  {x86.AJOC, x86.AJOS},
  1420  	ssa.BlockAMD64ULT: {x86.AJCS, x86.AJCC},
  1421  	ssa.BlockAMD64UGE: {x86.AJCC, x86.AJCS},
  1422  	ssa.BlockAMD64UGT: {x86.AJHI, x86.AJLS},
  1423  	ssa.BlockAMD64ULE: {x86.AJLS, x86.AJHI},
  1424  	ssa.BlockAMD64ORD: {x86.AJPC, x86.AJPS},
  1425  	ssa.BlockAMD64NAN: {x86.AJPS, x86.AJPC},
  1426  }
  1427  
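         // eqfJumps and nefJumps encode the two-branch sequences needed after a
         // floating-point compare: UCOMISS/UCOMISD report "unordered" (a NaN operand)
         // via the parity flag, so equality (ZF set && PF clear) and inequality
         // (ZF clear || PF set) each require two conditional jumps.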
  1428  var eqfJumps = [2][2]ssagen.IndexJump{
  1429  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}}, // next == b.Succs[0]
  1430  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}}, // next == b.Succs[1]
  1431  }
  1432  var nefJumps = [2][2]ssagen.IndexJump{
  1433  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}}, // next == b.Succs[0]
  1434  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}}, // next == b.Succs[1]
  1435  }
  1436  
  1437  func ssaGenBlock(s *ssagen.State, b, next *ssa.Block) {
  1438  	switch b.Kind {
  1439  	case ssa.BlockPlain, ssa.BlockDefer:
  1440  		if b.Succs[0].Block() != next {
  1441  			p := s.Prog(obj.AJMP)
  1442  			p.To.Type = obj.TYPE_BRANCH
  1443  			s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[0].Block()})
  1444  		}
  1445  	case ssa.BlockExit, ssa.BlockRetJmp:
  1446  	case ssa.BlockRet:
  1447  		s.Prog(obj.ARET)
  1448  
  1449  	case ssa.BlockAMD64EQF:
  1450  		s.CombJump(b, next, &eqfJumps)
  1451  
  1452  	case ssa.BlockAMD64NEF:
  1453  		s.CombJump(b, next, &nefJumps)
  1454  
  1455  	case ssa.BlockAMD64EQ, ssa.BlockAMD64NE,
  1456  		ssa.BlockAMD64LT, ssa.BlockAMD64GE,
  1457  		ssa.BlockAMD64LE, ssa.BlockAMD64GT,
  1458  		ssa.BlockAMD64OS, ssa.BlockAMD64OC,
  1459  		ssa.BlockAMD64ULT, ssa.BlockAMD64UGT,
  1460  		ssa.BlockAMD64ULE, ssa.BlockAMD64UGE:
  1461  		jmp := blockJump[b.Kind]
  1462  		switch next {
  1463  		case b.Succs[0].Block():
  1464  			s.Br(jmp.invasm, b.Succs[1].Block())
  1465  		case b.Succs[1].Block():
  1466  			s.Br(jmp.asm, b.Succs[0].Block())
  1467  		default:
  1468  			if b.Likely != ssa.BranchUnlikely {
  1469  				s.Br(jmp.asm, b.Succs[0].Block())
  1470  				s.Br(obj.AJMP, b.Succs[1].Block())
  1471  			} else {
  1472  				s.Br(jmp.invasm, b.Succs[1].Block())
  1473  				s.Br(obj.AJMP, b.Succs[0].Block())
  1474  			}
  1475  		}
  1476  
  1477  	case ssa.BlockAMD64JUMPTABLE:
  1478  		// JMP      *(TABLE)(INDEX*8)
  1479  		p := s.Prog(obj.AJMP)
  1480  		p.To.Type = obj.TYPE_MEM
  1481  		p.To.Reg = b.Controls[1].Reg()
  1482  		p.To.Index = b.Controls[0].Reg()
  1483  		p.To.Scale = 8
  1484  		// Save jump tables for later resolution of the target blocks.
  1485  		s.JumpTables = append(s.JumpTables, b)
  1486  
  1487  	default:
  1488  		b.Fatalf("branch not implemented: %s", b.LongString())
  1489  	}
  1490  }
  1491  
  1492  func loadRegResult(s *ssagen.State, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
  1493  	p := s.Prog(loadByType(t))
  1494  	p.From.Type = obj.TYPE_MEM
  1495  	p.From.Name = obj.NAME_AUTO
  1496  	p.From.Sym = n.Linksym()
  1497  	p.From.Offset = n.FrameOffset() + off
  1498  	p.To.Type = obj.TYPE_REG
  1499  	p.To.Reg = reg
  1500  	return p
  1501  }
  1502  
  1503  func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
  1504  	p = pp.Append(p, storeByType(t), obj.TYPE_REG, reg, 0, obj.TYPE_MEM, 0, n.FrameOffset()+off)
  1505  	p.To.Name = obj.NAME_PARAM
  1506  	p.To.Sym = n.Linksym()
  1507  	p.Pos = p.Pos.WithNotStmt()
  1508  	return p
  1509  }
  1510  
