Source file src/cmd/compile/internal/ssagen/intrinsics.go

     1  // Copyright 2024 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package ssagen
     6  
     7  import (
     8  	"fmt"
     9  	"internal/abi"
    10  	"internal/buildcfg"
    11  
    12  	"cmd/compile/internal/base"
    13  	"cmd/compile/internal/ir"
    14  	"cmd/compile/internal/ssa"
    15  	"cmd/compile/internal/typecheck"
    16  	"cmd/compile/internal/types"
    17  	"cmd/internal/sys"
    18  )
    19  
// intrinsics is the global registry of intrinsic builders, keyed by
// (architecture, package, function). It is populated by initIntrinsics.
var intrinsics intrinsicBuilders
    21  
// An intrinsicBuilder converts a call node n into an ssa value that
// implements that call as an intrinsic. args is a list of arguments to the func.
// A builder may return nil when the intrinsic produces no SSA result value
// (for example, when it only updates memory state).
type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value
    25  
// intrinsicKey identifies an intrinsic: the target architecture together
// with the package-qualified name of the function it implements.
type intrinsicKey struct {
	arch *sys.Arch // target architecture
	pkg  string    // package path, e.g. "internal/runtime/atomic"
	fn   string    // function name within pkg
}
    31  
// intrinsicBuildConfig specifies the config to use for intrinsic building.
type intrinsicBuildConfig struct {
	// instrumenting reports whether the build is instrumenting; some
	// intrinsics (e.g. runtime.slicebytetostringtmp) are not registered
	// in that case.
	instrumenting bool

	// Per-architecture feature/version selectors, normally copied from
	// buildcfg by initIntrinsics when no explicit config is supplied.
	go386     string
	goamd64   int
	goarm     buildcfg.GoarmFeatures
	goarm64   buildcfg.Goarm64Features
	gomips    string
	gomips64  string
	goppc64   int
	goriscv64 int
}
    45  
// intrinsicBuilders maps intrinsic keys to the builders that implement them.
type intrinsicBuilders map[intrinsicKey]intrinsicBuilder
    47  
    48  // add adds the intrinsic builder b for pkg.fn for the given architecture.
    49  func (ib intrinsicBuilders) add(arch *sys.Arch, pkg, fn string, b intrinsicBuilder) {
    50  	if _, found := ib[intrinsicKey{arch, pkg, fn}]; found {
    51  		panic(fmt.Sprintf("intrinsic already exists for %v.%v on %v", pkg, fn, arch.Name))
    52  	}
    53  	ib[intrinsicKey{arch, pkg, fn}] = b
    54  }
    55  
    56  // addForArchs adds the intrinsic builder b for pkg.fn for the given architectures.
    57  func (ib intrinsicBuilders) addForArchs(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
    58  	for _, arch := range archs {
    59  		ib.add(arch, pkg, fn, b)
    60  	}
    61  }
    62  
    63  // addForFamilies does the same as addForArchs but operates on architecture families.
    64  func (ib intrinsicBuilders) addForFamilies(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
    65  	for _, arch := range sys.Archs {
    66  		if arch.InFamily(archFamilies...) {
    67  			intrinsics.add(arch, pkg, fn, b)
    68  		}
    69  	}
    70  }
    71  
    72  // alias aliases pkg.fn to targetPkg.targetFn for all architectures in archs
    73  // for which targetPkg.targetFn already exists.
    74  func (ib intrinsicBuilders) alias(pkg, fn, targetPkg, targetFn string, archs ...*sys.Arch) {
    75  	// TODO(jsing): Consider making this work even if the alias is added
    76  	// before the intrinsic.
    77  	aliased := false
    78  	for _, arch := range archs {
    79  		if b := intrinsics.lookup(arch, targetPkg, targetFn); b != nil {
    80  			intrinsics.add(arch, pkg, fn, b)
    81  			aliased = true
    82  		}
    83  	}
    84  	if !aliased {
    85  		panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn))
    86  	}
    87  }
    88  
    89  // lookup looks up the intrinsic for a pkg.fn on the specified architecture.
    90  func (ib intrinsicBuilders) lookup(arch *sys.Arch, pkg, fn string) intrinsicBuilder {
    91  	return intrinsics[intrinsicKey{arch, pkg, fn}]
    92  }
    93  
    94  func initIntrinsics(cfg *intrinsicBuildConfig) {
    95  	if cfg == nil {
    96  		cfg = &intrinsicBuildConfig{
    97  			instrumenting: base.Flag.Cfg.Instrumenting,
    98  			go386:         buildcfg.GO386,
    99  			goamd64:       buildcfg.GOAMD64,
   100  			goarm:         buildcfg.GOARM,
   101  			goarm64:       buildcfg.GOARM64,
   102  			gomips:        buildcfg.GOMIPS,
   103  			gomips64:      buildcfg.GOMIPS64,
   104  			goppc64:       buildcfg.GOPPC64,
   105  			goriscv64:     buildcfg.GORISCV64,
   106  		}
   107  	}
   108  	intrinsics = intrinsicBuilders{}
   109  
   110  	var p4 []*sys.Arch
   111  	var p8 []*sys.Arch
   112  	var lwatomics []*sys.Arch
   113  	for _, a := range sys.Archs {
   114  		if a.PtrSize == 4 {
   115  			p4 = append(p4, a)
   116  		} else {
   117  			p8 = append(p8, a)
   118  		}
   119  		if a.Family != sys.PPC64 {
   120  			lwatomics = append(lwatomics, a)
   121  		}
   122  	}
   123  	all := sys.Archs[:]
   124  
   125  	add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
   126  		intrinsics.addForArchs(pkg, fn, b, archs...)
   127  	}
   128  	addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
   129  		intrinsics.addForFamilies(pkg, fn, b, archFamilies...)
   130  	}
   131  	alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) {
   132  		intrinsics.alias(pkg, fn, pkg2, fn2, archs...)
   133  	}
   134  
   135  	/******** runtime ********/
   136  	if !cfg.instrumenting {
   137  		add("runtime", "slicebytetostringtmp",
   138  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   139  				// Compiler frontend optimizations emit OBYTES2STRTMP nodes
   140  				// for the backend instead of slicebytetostringtmp calls
   141  				// when not instrumenting.
   142  				return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1])
   143  			},
   144  			all...)
   145  	}
   146  	addF("internal/runtime/math", "MulUintptr",
   147  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   148  			if s.config.PtrSize == 4 {
   149  				return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
   150  			}
   151  			return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
   152  		},
   153  		sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.ARM64)
   154  	add("runtime", "KeepAlive",
   155  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   156  			data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0])
   157  			s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem())
   158  			return nil
   159  		},
   160  		all...)
   161  
   162  	addF("runtime", "publicationBarrier",
   163  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   164  			s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem())
   165  			return nil
   166  		},
   167  		sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64)
   168  
   169  	/******** internal/runtime/sys ********/
   170  	add("internal/runtime/sys", "GetCallerPC",
   171  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   172  			return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr)
   173  		},
   174  		all...)
   175  
   176  	add("internal/runtime/sys", "GetCallerSP",
   177  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   178  			return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem())
   179  		},
   180  		all...)
   181  
   182  	add("internal/runtime/sys", "GetClosurePtr",
   183  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   184  			return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr)
   185  		},
   186  		all...)
   187  
   188  	addF("internal/runtime/sys", "Bswap32",
   189  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   190  			return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
   191  		},
   192  		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X)
   193  	addF("internal/runtime/sys", "Bswap64",
   194  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   195  			return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
   196  		},
   197  		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X)
   198  
   199  	addF("runtime", "memequal",
   200  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   201  			return s.newValue4(ssa.OpMemEq, s.f.Config.Types.Bool, args[0], args[1], args[2], s.mem())
   202  		},
   203  		sys.ARM64)
   204  
   205  	if cfg.goppc64 >= 10 {
   206  		// Use only on Power10 as the new byte reverse instructions that Power10 provide
   207  		// make it worthwhile as an intrinsic
   208  		addF("internal/runtime/sys", "Bswap32",
   209  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   210  				return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
   211  			},
   212  			sys.PPC64)
   213  		addF("internal/runtime/sys", "Bswap64",
   214  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   215  				return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
   216  			},
   217  			sys.PPC64)
   218  	}
   219  
   220  	if cfg.goriscv64 >= 22 {
   221  		addF("internal/runtime/sys", "Bswap32",
   222  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   223  				return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
   224  			},
   225  			sys.RISCV64)
   226  		addF("internal/runtime/sys", "Bswap64",
   227  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   228  				return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
   229  			},
   230  			sys.RISCV64)
   231  	}
   232  
   233  	/****** Prefetch ******/
   234  	makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   235  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   236  			s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem())
   237  			return nil
   238  		}
   239  	}
   240  
   241  	// Make Prefetch intrinsics for supported platforms
   242  	// On the unsupported platforms stub function will be eliminated
   243  	addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
   244  		sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64)
   245  	addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
   246  		sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64)
   247  
   248  	/******** internal/runtime/atomic ********/
   249  	type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool)
   250  
   251  	addF("internal/runtime/atomic", "Load",
   252  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   253  			v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
   254  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   255  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   256  		},
   257  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   258  	addF("internal/runtime/atomic", "Load8",
   259  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   260  			v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem())
   261  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   262  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
   263  		},
   264  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   265  	addF("internal/runtime/atomic", "Load64",
   266  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   267  			v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
   268  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   269  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   270  		},
   271  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   272  	addF("internal/runtime/atomic", "LoadAcq",
   273  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   274  			v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
   275  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   276  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   277  		},
   278  		sys.PPC64)
   279  	addF("internal/runtime/atomic", "LoadAcq64",
   280  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   281  			v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
   282  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   283  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   284  		},
   285  		sys.PPC64)
   286  	addF("internal/runtime/atomic", "Loadp",
   287  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   288  			v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem())
   289  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   290  			return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v)
   291  		},
   292  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   293  
   294  	addF("internal/runtime/atomic", "Store",
   295  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   296  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem())
   297  			return nil
   298  		},
   299  		sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   300  	addF("internal/runtime/atomic", "Store8",
   301  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   302  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem())
   303  			return nil
   304  		},
   305  		sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   306  	addF("internal/runtime/atomic", "Store64",
   307  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   308  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem())
   309  			return nil
   310  		},
   311  		sys.AMD64, sys.ARM64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   312  	addF("internal/runtime/atomic", "StorepNoWB",
   313  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   314  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem())
   315  			return nil
   316  		},
   317  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X)
   318  	addF("internal/runtime/atomic", "StoreRel",
   319  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   320  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem())
   321  			return nil
   322  		},
   323  		sys.PPC64)
   324  	addF("internal/runtime/atomic", "StoreRel64",
   325  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   326  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem())
   327  			return nil
   328  		},
   329  		sys.PPC64)
   330  
   331  	makeAtomicStoreGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   332  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   333  			// Target Atomic feature is identified by dynamic detection
   334  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
   335  			v := s.load(types.Types[types.TBOOL], addr)
   336  			b := s.endBlock()
   337  			b.Kind = ssa.BlockIf
   338  			b.SetControl(v)
   339  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   340  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   341  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   342  			b.AddEdgeTo(bTrue)
   343  			b.AddEdgeTo(bFalse)
   344  			b.Likely = ssa.BranchLikely
   345  
   346  			// We have atomic instructions - use it directly.
   347  			s.startBlock(bTrue)
   348  			emit(s, n, args, op1, typ, false)
   349  			s.endBlock().AddEdgeTo(bEnd)
   350  
   351  			// Use original instruction sequence.
   352  			s.startBlock(bFalse)
   353  			emit(s, n, args, op0, typ, false)
   354  			s.endBlock().AddEdgeTo(bEnd)
   355  
   356  			// Merge results.
   357  			s.startBlock(bEnd)
   358  
   359  			return nil
   360  		}
   361  	}
   362  
   363  	atomicStoreEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   364  		v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
   365  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   366  		if needReturn {
   367  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   368  		}
   369  	}
   370  
   371  	addF("internal/runtime/atomic", "Store8",
   372  		makeAtomicStoreGuardedIntrinsicLoong64(ssa.OpAtomicStore8, ssa.OpAtomicStore8Variant, types.TUINT8, atomicStoreEmitterLoong64),
   373  		sys.Loong64)
   374  	addF("internal/runtime/atomic", "Store",
   375  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   376  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32Variant, types.TypeMem, args[0], args[1], s.mem())
   377  			return nil
   378  		},
   379  		sys.Loong64)
   380  	addF("internal/runtime/atomic", "Store64",
   381  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   382  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64Variant, types.TypeMem, args[0], args[1], s.mem())
   383  			return nil
   384  		},
   385  		sys.Loong64)
   386  
   387  	addF("internal/runtime/atomic", "Xchg8",
   388  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   389  			v := s.newValue3(ssa.OpAtomicExchange8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem())
   390  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   391  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
   392  		},
   393  		sys.AMD64, sys.PPC64)
   394  	addF("internal/runtime/atomic", "Xchg",
   395  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   396  			v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   397  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   398  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   399  		},
   400  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   401  	addF("internal/runtime/atomic", "Xchg64",
   402  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   403  			v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   404  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   405  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   406  		},
   407  		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   408  
   409  	makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder {
   410  
   411  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   412  			if cfg.goarm64.LSE {
   413  				emit(s, n, args, op1, typ, needReturn)
   414  			} else {
   415  				// Target Atomic feature is identified by dynamic detection
   416  				addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb)
   417  				v := s.load(types.Types[types.TBOOL], addr)
   418  				b := s.endBlock()
   419  				b.Kind = ssa.BlockIf
   420  				b.SetControl(v)
   421  				bTrue := s.f.NewBlock(ssa.BlockPlain)
   422  				bFalse := s.f.NewBlock(ssa.BlockPlain)
   423  				bEnd := s.f.NewBlock(ssa.BlockPlain)
   424  				b.AddEdgeTo(bTrue)
   425  				b.AddEdgeTo(bFalse)
   426  				b.Likely = ssa.BranchLikely
   427  
   428  				// We have atomic instructions - use it directly.
   429  				s.startBlock(bTrue)
   430  				emit(s, n, args, op1, typ, needReturn)
   431  				s.endBlock().AddEdgeTo(bEnd)
   432  
   433  				// Use original instruction sequence.
   434  				s.startBlock(bFalse)
   435  				emit(s, n, args, op0, typ, needReturn)
   436  				s.endBlock().AddEdgeTo(bEnd)
   437  
   438  				// Merge results.
   439  				s.startBlock(bEnd)
   440  			}
   441  			if needReturn {
   442  				return s.variable(n, types.Types[typ])
   443  			} else {
   444  				return nil
   445  			}
   446  		}
   447  	}
   448  	makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   449  		return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true)
   450  	}
   451  	makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   452  		return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false)
   453  	}
   454  
   455  	atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   456  		v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
   457  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   458  		if needReturn {
   459  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   460  		}
   461  	}
   462  	addF("internal/runtime/atomic", "Xchg8",
   463  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange8, ssa.OpAtomicExchange8Variant, types.TUINT8, atomicEmitterARM64),
   464  		sys.ARM64)
   465  	addF("internal/runtime/atomic", "Xchg",
   466  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, atomicEmitterARM64),
   467  		sys.ARM64)
   468  	addF("internal/runtime/atomic", "Xchg64",
   469  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64),
   470  		sys.ARM64)
   471  
   472  	makeAtomicXchg8GuardedIntrinsicLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   473  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   474  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
   475  			v := s.load(types.Types[types.TBOOL], addr)
   476  			b := s.endBlock()
   477  			b.Kind = ssa.BlockIf
   478  			b.SetControl(v)
   479  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   480  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   481  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   482  			b.AddEdgeTo(bTrue)
   483  			b.AddEdgeTo(bFalse)
   484  			b.Likely = ssa.BranchLikely // most loong64 machines support the amswapdb.b
   485  
   486  			// We have the intrinsic - use it directly.
   487  			s.startBlock(bTrue)
   488  			s.vars[n] = s.newValue3(op, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem())
   489  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, s.vars[n])
   490  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], s.vars[n])
   491  			s.endBlock().AddEdgeTo(bEnd)
   492  
   493  			// Call the pure Go version.
   494  			s.startBlock(bFalse)
   495  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TUINT8]
   496  			s.endBlock().AddEdgeTo(bEnd)
   497  
   498  			// Merge results.
   499  			s.startBlock(bEnd)
   500  			return s.variable(n, types.Types[types.TUINT8])
   501  		}
   502  	}
   503  	addF("internal/runtime/atomic", "Xchg8",
   504  		makeAtomicXchg8GuardedIntrinsicLoong64(ssa.OpAtomicExchange8Variant),
   505  		sys.Loong64)
   506  
   507  	addF("internal/runtime/atomic", "Xadd",
   508  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   509  			v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   510  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   511  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   512  		},
   513  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   514  	addF("internal/runtime/atomic", "Xadd64",
   515  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   516  			v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   517  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   518  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   519  		},
   520  		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   521  
   522  	addF("internal/runtime/atomic", "Xadd",
   523  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, atomicEmitterARM64),
   524  		sys.ARM64)
   525  	addF("internal/runtime/atomic", "Xadd64",
   526  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, atomicEmitterARM64),
   527  		sys.ARM64)
   528  
   529  	addF("internal/runtime/atomic", "Cas",
   530  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   531  			v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   532  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   533  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   534  		},
   535  		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   536  	addF("internal/runtime/atomic", "Cas64",
   537  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   538  			v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   539  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   540  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   541  		},
   542  		sys.AMD64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   543  	addF("internal/runtime/atomic", "CasRel",
   544  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   545  			v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   546  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   547  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   548  		},
   549  		sys.PPC64)
   550  
   551  	atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   552  		v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   553  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   554  		if needReturn {
   555  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   556  		}
   557  	}
   558  
   559  	addF("internal/runtime/atomic", "Cas",
   560  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TBOOL, atomicCasEmitterARM64),
   561  		sys.ARM64)
   562  	addF("internal/runtime/atomic", "Cas64",
   563  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64),
   564  		sys.ARM64)
   565  
   566  	atomicCasEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   567  		v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   568  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   569  		if needReturn {
   570  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   571  		}
   572  	}
   573  
   574  	makeAtomicCasGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, emit atomicOpEmitter) intrinsicBuilder {
   575  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   576  			// Target Atomic feature is identified by dynamic detection
   577  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAMCAS, s.sb)
   578  			v := s.load(types.Types[types.TBOOL], addr)
   579  			b := s.endBlock()
   580  			b.Kind = ssa.BlockIf
   581  			b.SetControl(v)
   582  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   583  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   584  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   585  			b.AddEdgeTo(bTrue)
   586  			b.AddEdgeTo(bFalse)
   587  			b.Likely = ssa.BranchLikely
   588  
   589  			// We have atomic instructions - use it directly.
   590  			s.startBlock(bTrue)
   591  			emit(s, n, args, op1, types.TBOOL, true)
   592  			s.endBlock().AddEdgeTo(bEnd)
   593  
   594  			// Use original instruction sequence.
   595  			s.startBlock(bFalse)
   596  			emit(s, n, args, op0, types.TBOOL, true)
   597  			s.endBlock().AddEdgeTo(bEnd)
   598  
   599  			// Merge results.
   600  			s.startBlock(bEnd)
   601  
   602  			return s.variable(n, types.Types[types.TBOOL])
   603  		}
   604  	}
   605  
   606  	addF("internal/runtime/atomic", "Cas",
   607  		makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, atomicCasEmitterLoong64),
   608  		sys.Loong64)
   609  	addF("internal/runtime/atomic", "Cas64",
   610  		makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, atomicCasEmitterLoong64),
   611  		sys.Loong64)
   612  
   613  	// Old-style atomic logical operation API (all supported archs except arm64).
   614  	addF("internal/runtime/atomic", "And8",
   615  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   616  			s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
   617  			return nil
   618  		},
   619  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   620  	addF("internal/runtime/atomic", "And",
   621  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   622  			s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem())
   623  			return nil
   624  		},
   625  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   626  	addF("internal/runtime/atomic", "Or8",
   627  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   628  			s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
   629  			return nil
   630  		},
   631  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   632  	addF("internal/runtime/atomic", "Or",
   633  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   634  			s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem())
   635  			return nil
   636  		},
   637  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   638  
	// arm64 always uses the new-style atomic logical operations, for both the
	// old and new style API.
	// The "...ARM64old" makers serve the old API names (And8/Or8/And/Or)
	// while the plain "...ARM64" makers serve the new value-returning API
	// (And32/And64/Or32/Or64); both are backed by the same value-returning
	// SSA ops, with a guarded choice between the plain and "Variant" forms.
	addF("internal/runtime/atomic", "And8",
		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64),
		sys.ARM64)
	addF("internal/runtime/atomic", "Or8",
		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64),
		sys.ARM64)
	addF("internal/runtime/atomic", "And64",
		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64),
		sys.ARM64)
	addF("internal/runtime/atomic", "And32",
		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
		sys.ARM64)
	addF("internal/runtime/atomic", "And",
		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
		sys.ARM64)
	addF("internal/runtime/atomic", "Or64",
		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64),
		sys.ARM64)
	addF("internal/runtime/atomic", "Or32",
		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
		sys.ARM64)
	addF("internal/runtime/atomic", "Or",
		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
		sys.ARM64)
   665  
	// New-style atomic logical operations, which return the old memory value.
	// Each builder emits a (value, mem) tuple, splits it, threads the memory
	// component through s.vars[memVar], and returns the old value.
	addF("internal/runtime/atomic", "And64",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
			p0, p1 := s.split(v)
			s.vars[memVar] = p1
			return p0
		},
		sys.AMD64, sys.Loong64)
	addF("internal/runtime/atomic", "And32",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
			p0, p1 := s.split(v)
			s.vars[memVar] = p1
			return p0
		},
		sys.AMD64, sys.Loong64)
	addF("internal/runtime/atomic", "Or64",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
			p0, p1 := s.split(v)
			s.vars[memVar] = p1
			return p0
		},
		sys.AMD64, sys.Loong64)
	addF("internal/runtime/atomic", "Or32",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
			p0, p1 := s.split(v)
			s.vars[memVar] = p1
			return p0
		},
		sys.AMD64, sys.Loong64)
   699  
	// Aliases for atomic load operations
	alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...)
	alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...)
	// uintptr/uint-sized entry points alias the 32-bit op on 4-byte-pointer
	// targets (p4) and the 64-bit op on 8-byte-pointer targets (p8).
	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...)
	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...)
	alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load", p4...)
	alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load64", p8...)
	// Acquire/release flavors only exist on the lwatomics targets; the sync
	// entries are linknamed to the runtime implementations.
	alias("internal/runtime/atomic", "LoadAcq", "internal/runtime/atomic", "Load", lwatomics...)
	alias("internal/runtime/atomic", "LoadAcq64", "internal/runtime/atomic", "Load64", lwatomics...)
	alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...)
	alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) // linknamed
	alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...)
	alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) // linknamed

	// Aliases for atomic store operations
	alias("internal/runtime/atomic", "Storeint32", "internal/runtime/atomic", "Store", all...)
	alias("internal/runtime/atomic", "Storeint64", "internal/runtime/atomic", "Store64", all...)
	alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store", p4...)
	alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store64", p8...)
	alias("internal/runtime/atomic", "StoreRel", "internal/runtime/atomic", "Store", lwatomics...)
	alias("internal/runtime/atomic", "StoreRel64", "internal/runtime/atomic", "Store64", lwatomics...)
	alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...)
	alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) // linknamed
	alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...)
	alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) // linknamed

	// Aliases for atomic swap operations
	alias("internal/runtime/atomic", "Xchgint32", "internal/runtime/atomic", "Xchg", all...)
	alias("internal/runtime/atomic", "Xchgint64", "internal/runtime/atomic", "Xchg64", all...)
	alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg", p4...)
	alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg64", p8...)

	// Aliases for atomic add operations
	alias("internal/runtime/atomic", "Xaddint32", "internal/runtime/atomic", "Xadd", all...)
	alias("internal/runtime/atomic", "Xaddint64", "internal/runtime/atomic", "Xadd64", all...)
	alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd", p4...)
	alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd64", p8...)

	// Aliases for atomic CAS operations
	alias("internal/runtime/atomic", "Casint32", "internal/runtime/atomic", "Cas", all...)
	alias("internal/runtime/atomic", "Casint64", "internal/runtime/atomic", "Cas64", all...)
	alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas", p4...)
	alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas64", p8...)
	alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas", p4...)
	alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...)
	alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...)

	// Aliases for atomic And/Or operations
	// Only registered for the 64-bit targets that intrinsify And64/Or64 above.
	alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchLoong64)
	alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchLoong64)
   750  
	/******** math ********/
	// Note: "sqrt" (lowercase) is the unexported helper that math.Sqrt wraps,
	// which is why the intrinsic is registered under that name.
	addF("math", "sqrt",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
		},
		sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
	addF("math", "Trunc",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0])
		},
		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
	addF("math", "Ceil",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0])
		},
		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
	addF("math", "Floor",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0])
		},
		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
	addF("math", "Round",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0])
		},
		sys.ARM64, sys.PPC64, sys.S390X)
	addF("math", "RoundToEven",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0])
		},
		sys.ARM64, sys.S390X, sys.Wasm)
	addF("math", "Abs",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0])
		},
		sys.ARM64, sys.ARM, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64)
	addF("math", "Copysign",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1])
		},
		sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm)
	// These architectures get FMA with no runtime guard.
	addF("math", "FMA",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
		},
		sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X)
	// amd64: with a GOAMD64 baseline below v3, emit a runtime CPU-feature
	// test and fall back to a normal call of the pure Go implementation
	// when the FMA instruction is unavailable.
	addF("math", "FMA",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			if cfg.goamd64 >= 3 {
				// FMA is guaranteed by the GOAMD64=v3 baseline; no guard needed.
				return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
			}

			// Build an if/else diamond controlled by the X86HasFMA feature
			// flag; the two arms are merged through s.vars[n].
			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA)
			b := s.endBlock()
			b.Kind = ssa.BlockIf
			b.SetControl(v)
			bTrue := s.f.NewBlock(ssa.BlockPlain)
			bFalse := s.f.NewBlock(ssa.BlockPlain)
			bEnd := s.f.NewBlock(ssa.BlockPlain)
			b.AddEdgeTo(bTrue)
			b.AddEdgeTo(bFalse)
			b.Likely = ssa.BranchLikely // >= haswell cpus are common

			// We have the intrinsic - use it directly.
			s.startBlock(bTrue)
			s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
			s.endBlock().AddEdgeTo(bEnd)

			// Call the pure Go version.
			s.startBlock(bFalse)
			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
			s.endBlock().AddEdgeTo(bEnd)

			// Merge results.
			s.startBlock(bEnd)
			return s.variable(n, types.Types[types.TFLOAT64])
		},
		sys.AMD64)
	// arm: same diamond shape as the amd64 guard, but the feature bit
	// (ARMHasVFPv4) is loaded from a runtime variable instead of using
	// OpHasCPUFeature.
	addF("math", "FMA",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb)
			v := s.load(types.Types[types.TBOOL], addr)
			b := s.endBlock()
			b.Kind = ssa.BlockIf
			b.SetControl(v)
			bTrue := s.f.NewBlock(ssa.BlockPlain)
			bFalse := s.f.NewBlock(ssa.BlockPlain)
			bEnd := s.f.NewBlock(ssa.BlockPlain)
			b.AddEdgeTo(bTrue)
			b.AddEdgeTo(bFalse)
			b.Likely = ssa.BranchLikely

			// We have the intrinsic - use it directly.
			s.startBlock(bTrue)
			s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
			s.endBlock().AddEdgeTo(bEnd)

			// Call the pure Go version.
			s.startBlock(bFalse)
			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
			s.endBlock().AddEdgeTo(bEnd)

			// Merge results.
			s.startBlock(bEnd)
			return s.variable(n, types.Types[types.TFLOAT64])
		},
		sys.ARM)
   858  
	// makeRoundAMD64 returns a builder for the amd64 rounding intrinsics.
	// With GOAMD64 >= v2 the rounding instruction is part of the
	// compile-time baseline and is emitted unconditionally; otherwise the
	// builder emits a runtime X86HasSSE41 test with a fallback call to the
	// pure Go implementation.
	makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			if cfg.goamd64 >= 2 {
				return s.newValue1(op, types.Types[types.TFLOAT64], args[0])
			}

			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41)
			b := s.endBlock()
			b.Kind = ssa.BlockIf
			b.SetControl(v)
			bTrue := s.f.NewBlock(ssa.BlockPlain)
			bFalse := s.f.NewBlock(ssa.BlockPlain)
			bEnd := s.f.NewBlock(ssa.BlockPlain)
			b.AddEdgeTo(bTrue)
			b.AddEdgeTo(bFalse)
			b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays

			// We have the intrinsic - use it directly.
			s.startBlock(bTrue)
			s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0])
			s.endBlock().AddEdgeTo(bEnd)

			// Call the pure Go version.
			s.startBlock(bFalse)
			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
			s.endBlock().AddEdgeTo(bEnd)

			// Merge results.
			s.startBlock(bEnd)
			return s.variable(n, types.Types[types.TFLOAT64])
		}
	}
	addF("math", "RoundToEven",
		makeRoundAMD64(ssa.OpRoundToEven),
		sys.AMD64)
	addF("math", "Floor",
		makeRoundAMD64(ssa.OpFloor),
		sys.AMD64)
	addF("math", "Ceil",
		makeRoundAMD64(ssa.OpCeil),
		sys.AMD64)
	addF("math", "Trunc",
		makeRoundAMD64(ssa.OpTrunc),
		sys.AMD64)
   903  
	/******** math/bits ********/
	addF("math/bits", "TrailingZeros64",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
		},
		sys.AMD64, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
	// 386 has no 64-bit registers: split the operand into its 32-bit halves
	// and count through the combined Ctz64On32 op.
	addF("math/bits", "TrailingZeros64",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			lo := s.newValue1(ssa.OpInt64Lo, types.Types[types.TUINT32], args[0])
			hi := s.newValue1(ssa.OpInt64Hi, types.Types[types.TUINT32], args[0])
			return s.newValue2(ssa.OpCtz64On32, types.Types[types.TINT], lo, hi)
		},
		sys.I386)
	addF("math/bits", "TrailingZeros32",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
		},
		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
	addF("math/bits", "TrailingZeros16",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
		},
		sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.Loong64, sys.PPC64, sys.S390X, sys.Wasm)
	addF("math/bits", "TrailingZeros8",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
		},
		sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.Loong64, sys.PPC64, sys.S390X, sys.Wasm)

	// riscv64 is only intrinsified when the GORISCV64 baseline is at least
	// rva22u64 (which guarantees the Zbb bit-manipulation extension).
	if cfg.goriscv64 >= 22 {
		addF("math/bits", "TrailingZeros64",
			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
				return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
			},
			sys.RISCV64)
		addF("math/bits", "TrailingZeros32",
			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
				return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
			},
			sys.RISCV64)
		addF("math/bits", "TrailingZeros16",
			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
				return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
			},
			sys.RISCV64)
		addF("math/bits", "TrailingZeros8",
			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
				return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
			},
			sys.RISCV64)
	}
   955  
	// ReverseBytes inlines correctly, no need to intrinsify it.
	alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...)
	alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...)
	// Nothing special is needed for targets where ReverseBytes16 lowers to a rotate
	addF("math/bits", "ReverseBytes16",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
		},
		sys.Loong64)
	if cfg.goppc64 >= 10 {
		// On Power10, 16-bit rotate is not available so use BRH instruction
		addF("math/bits", "ReverseBytes16",
			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
				// NOTE(review): the result type here is TUINT, while the
				// loong64/riscv64 registrations above/below use TUINT16 —
				// confirm the wider type is intentional for ppc64.
				return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
			},
			sys.PPC64)
	}
	// riscv64: requires the Zbb extension, guaranteed from rva22u64 up.
	if cfg.goriscv64 >= 22 {
		addF("math/bits", "ReverseBytes16",
			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
				return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
			},
			sys.RISCV64)
	}
   980  
	addF("math/bits", "Len64",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
		},
		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
	addF("math/bits", "Len32",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
		},
		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
	addF("math/bits", "Len16",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
		},
		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
	addF("math/bits", "Len8",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
		},
		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)

	// riscv64: bit-length intrinsics require the Zbb extension, which the
	// rva22u64 baseline (and later) guarantees.
	if cfg.goriscv64 >= 22 {
		addF("math/bits", "Len64",
			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
				return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
			},
			sys.RISCV64)
		addF("math/bits", "Len32",
			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
			},
			sys.RISCV64)
		addF("math/bits", "Len16",
			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
				return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
			},
			sys.RISCV64)
		addF("math/bits", "Len8",
			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
				return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
			},
			sys.RISCV64)
	}

	// bits.Len is uint-sized: 64-bit op on p8 targets, 32-bit op on p4 targets.
	alias("math/bits", "Len", "math/bits", "Len64", p8...)
	alias("math/bits", "Len", "math/bits", "Len32", p4...)
  1027  
	// LeadingZeros is handled because it trivially calls Len.
	addF("math/bits", "Reverse64",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
		},
		sys.ARM64, sys.Loong64)
	addF("math/bits", "Reverse32",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0])
		},
		sys.ARM64, sys.Loong64)
	addF("math/bits", "Reverse16",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0])
		},
		sys.ARM64, sys.Loong64)
	addF("math/bits", "Reverse8",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0])
		},
		sys.ARM64, sys.Loong64)
	// uint-sized Reverse uses the 64-bit op; it is only registered for
	// arm64 and loong64, where uint is 64 bits wide.
	addF("math/bits", "Reverse",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
		},
		sys.ARM64, sys.Loong64)
	addF("math/bits", "RotateLeft8",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1])
		},
		sys.AMD64, sys.RISCV64)
	addF("math/bits", "RotateLeft16",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1])
		},
		sys.AMD64, sys.RISCV64)
	addF("math/bits", "RotateLeft32",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1])
		},
		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
	addF("math/bits", "RotateLeft64",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1])
		},
		sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
	// uint-sized RotateLeft is only aliased on 8-byte-pointer targets.
	alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...)
  1075  
	// makeOnesCountAMD64 returns a builder for the popcount intrinsics.
	// With GOAMD64 >= v2 the instruction is part of the compile-time
	// baseline; otherwise the builder emits a runtime X86HasPOPCNT test
	// with a fallback call to the pure Go implementation.
	makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			if cfg.goamd64 >= 2 {
				return s.newValue1(op, types.Types[types.TINT], args[0])
			}

			// if/else diamond on the POPCNT feature flag, merged via s.vars[n].
			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT)
			b := s.endBlock()
			b.Kind = ssa.BlockIf
			b.SetControl(v)
			bTrue := s.f.NewBlock(ssa.BlockPlain)
			bFalse := s.f.NewBlock(ssa.BlockPlain)
			bEnd := s.f.NewBlock(ssa.BlockPlain)
			b.AddEdgeTo(bTrue)
			b.AddEdgeTo(bFalse)
			b.Likely = ssa.BranchLikely // most machines have popcnt nowadays

			// We have the intrinsic - use it directly.
			s.startBlock(bTrue)
			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
			s.endBlock().AddEdgeTo(bEnd)

			// Call the pure Go version.
			s.startBlock(bFalse)
			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
			s.endBlock().AddEdgeTo(bEnd)

			// Merge results.
			s.startBlock(bEnd)
			return s.variable(n, types.Types[types.TINT])
		}
	}

	// makeOnesCountLoong64 returns a builder that always guards at run time:
	// the Loong64HasLSX flag is loaded from memory and selects between the
	// popcount op and a call to the pure Go implementation.
	makeOnesCountLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLSX, s.sb)
			v := s.load(types.Types[types.TBOOL], addr)
			b := s.endBlock()
			b.Kind = ssa.BlockIf
			b.SetControl(v)
			bTrue := s.f.NewBlock(ssa.BlockPlain)
			bFalse := s.f.NewBlock(ssa.BlockPlain)
			bEnd := s.f.NewBlock(ssa.BlockPlain)
			b.AddEdgeTo(bTrue)
			b.AddEdgeTo(bFalse)
			b.Likely = ssa.BranchLikely // most loong64 machines support the LSX

			// We have the intrinsic - use it directly.
			s.startBlock(bTrue)
			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
			s.endBlock().AddEdgeTo(bEnd)

			// Call the pure Go version.
			s.startBlock(bFalse)
			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
			s.endBlock().AddEdgeTo(bEnd)

			// Merge results.
			s.startBlock(bEnd)
			return s.variable(n, types.Types[types.TINT])
		}
	}

	// makeOnesCountRISCV64 returns a builder that emits popcount directly
	// when the GORISCV64 baseline guarantees Zbb (rva22u64 and later), and
	// otherwise guards at run time on the RISCV64HasZbb flag.
	makeOnesCountRISCV64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			if cfg.goriscv64 >= 22 {
				return s.newValue1(op, types.Types[types.TINT], args[0])
			}

			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.RISCV64HasZbb, s.sb)
			v := s.load(types.Types[types.TBOOL], addr)
			b := s.endBlock()
			b.Kind = ssa.BlockIf
			b.SetControl(v)
			bTrue := s.f.NewBlock(ssa.BlockPlain)
			bFalse := s.f.NewBlock(ssa.BlockPlain)
			bEnd := s.f.NewBlock(ssa.BlockPlain)
			b.AddEdgeTo(bTrue)
			b.AddEdgeTo(bFalse)
			b.Likely = ssa.BranchLikely // Majority of RISC-V support Zbb.

			// We have the intrinsic - use it directly.
			s.startBlock(bTrue)
			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
			s.endBlock().AddEdgeTo(bEnd)

			// Call the pure Go version.
			s.startBlock(bFalse)
			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
			s.endBlock().AddEdgeTo(bEnd)

			// Merge results.
			s.startBlock(bEnd)
			return s.variable(n, types.Types[types.TINT])
		}
	}
  1172  
	// OnesCount: guarded builders for amd64/loong64/riscv64, direct popcount
	// ops for the remaining architectures.
	addF("math/bits", "OnesCount64",
		makeOnesCountAMD64(ssa.OpPopCount64),
		sys.AMD64)
	addF("math/bits", "OnesCount64",
		makeOnesCountLoong64(ssa.OpPopCount64),
		sys.Loong64)
	addF("math/bits", "OnesCount64",
		makeOnesCountRISCV64(ssa.OpPopCount64),
		sys.RISCV64)
	addF("math/bits", "OnesCount64",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0])
		},
		sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
	addF("math/bits", "OnesCount32",
		makeOnesCountAMD64(ssa.OpPopCount32),
		sys.AMD64)
	addF("math/bits", "OnesCount32",
		makeOnesCountLoong64(ssa.OpPopCount32),
		sys.Loong64)
	addF("math/bits", "OnesCount32",
		makeOnesCountRISCV64(ssa.OpPopCount32),
		sys.RISCV64)
	addF("math/bits", "OnesCount32",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0])
		},
		sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
	addF("math/bits", "OnesCount16",
		makeOnesCountAMD64(ssa.OpPopCount16),
		sys.AMD64)
	addF("math/bits", "OnesCount16",
		makeOnesCountLoong64(ssa.OpPopCount16),
		sys.Loong64)
	addF("math/bits", "OnesCount16",
		makeOnesCountRISCV64(ssa.OpPopCount16),
		sys.RISCV64)
	addF("math/bits", "OnesCount16",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0])
		},
		sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm)
	addF("math/bits", "OnesCount8",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0])
		},
		sys.S390X, sys.PPC64, sys.Wasm)

	// NOTE(review): because this registration is itself conditional on
	// cfg.goriscv64 >= 22, makeOnesCountRISCV64's compile-time fast path is
	// always taken here, so its runtime Zbb fallback is never emitted for
	// OnesCount8 (unlike the OnesCount16/32/64 registrations above).
	if cfg.goriscv64 >= 22 {
		addF("math/bits", "OnesCount8",
			makeOnesCountRISCV64(ssa.OpPopCount8),
			sys.RISCV64)
	}

	// uint-sized OnesCount is only aliased on 8-byte-pointer targets.
	alias("math/bits", "OnesCount", "math/bits", "OnesCount64", p8...)
  1228  
	// bits.Mul64: full 64x64 multiply; the result tuple carries both
	// 64-bit halves of the 128-bit product.
	add("math/bits", "Mul64",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1])
		},
		all...)
	alias("math/bits", "Mul", "math/bits", "Mul64", p8...)
	alias("internal/runtime/math", "Mul64", "math/bits", "Mul64", p8...)
	// bits.Add64/Sub64: carry/borrow-propagating add and subtract; the
	// tuple carries (result, carryOut/borrowOut).
	addF("math/bits", "Add64",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
		},
		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
	alias("math/bits", "Add", "math/bits", "Add64", p8...)
	alias("internal/runtime/math", "Add64", "math/bits", "Add64", all...)
	addF("math/bits", "Sub64",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
		},
		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
	alias("math/bits", "Sub", "math/bits", "Sub64", p8...)
	// bits.Div64(hi, lo, y): 128-by-64-bit division, amd64 only.
	addF("math/bits", "Div64",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			// check for divide-by-zero/overflow and panic with appropriate message
			// y != 0, else panicdivide.
			cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64]))
			s.check(cmpZero, ir.Syms.Panicdivide)
			// hi < y, else the quotient does not fit in 64 bits: panicoverflow.
			cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2])
			s.check(cmpOverflow, ir.Syms.Panicoverflow)
			return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
		},
		sys.AMD64)
	// Div64 is only intrinsified on amd64, so Div only aliases there.
	alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64)
  1260  
	// The internal/runtime/sys bit helpers share the math/bits intrinsics.
	alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...)
	alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...)
	alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...)
	alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...)
	alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...)
	alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...)
  1267  
	/******** sync/atomic ********/

	// Note: these are disabled by flag_race in findIntrinsic below.
	alias("sync/atomic", "LoadInt32", "internal/runtime/atomic", "Load", all...)
	alias("sync/atomic", "LoadInt64", "internal/runtime/atomic", "Load64", all...)
	alias("sync/atomic", "LoadPointer", "internal/runtime/atomic", "Loadp", all...)
	alias("sync/atomic", "LoadUint32", "internal/runtime/atomic", "Load", all...)
	alias("sync/atomic", "LoadUint64", "internal/runtime/atomic", "Load64", all...)
	// uintptr variants split by pointer width: p4 -> 32-bit op, p8 -> 64-bit op.
	alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load", p4...)
	alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load64", p8...)

	alias("sync/atomic", "StoreInt32", "internal/runtime/atomic", "Store", all...)
	alias("sync/atomic", "StoreInt64", "internal/runtime/atomic", "Store64", all...)
	// Note: not StorePointer, that needs a write barrier.  Same below for {CompareAnd}Swap.
	alias("sync/atomic", "StoreUint32", "internal/runtime/atomic", "Store", all...)
	alias("sync/atomic", "StoreUint64", "internal/runtime/atomic", "Store64", all...)
	alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store", p4...)
	alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store64", p8...)

	alias("sync/atomic", "SwapInt32", "internal/runtime/atomic", "Xchg", all...)
	alias("sync/atomic", "SwapInt64", "internal/runtime/atomic", "Xchg64", all...)
	alias("sync/atomic", "SwapUint32", "internal/runtime/atomic", "Xchg", all...)
	alias("sync/atomic", "SwapUint64", "internal/runtime/atomic", "Xchg64", all...)
	alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg", p4...)
	alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg64", p8...)

	alias("sync/atomic", "CompareAndSwapInt32", "internal/runtime/atomic", "Cas", all...)
	alias("sync/atomic", "CompareAndSwapInt64", "internal/runtime/atomic", "Cas64", all...)
  1296  	alias("sync/atomic", "CompareAndSwapUint32", "internal/runtime/atomic", "Cas", all...)
  1297  	alias("sync/atomic", "CompareAndSwapUint64", "internal/runtime/atomic", "Cas64", all...)
  1298  	alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas", p4...)
  1299  	alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas64", p8...)
  1300  
  1301  	alias("sync/atomic", "AddInt32", "internal/runtime/atomic", "Xadd", all...)
  1302  	alias("sync/atomic", "AddInt64", "internal/runtime/atomic", "Xadd64", all...)
  1303  	alias("sync/atomic", "AddUint32", "internal/runtime/atomic", "Xadd", all...)
  1304  	alias("sync/atomic", "AddUint64", "internal/runtime/atomic", "Xadd64", all...)
  1305  	alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...)
  1306  	alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...)
  1307  
  1308  	alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1309  	alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1310  	alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1311  	alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1312  	alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1313  	alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1314  	alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1315  	alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1316  	alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1317  	alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1318  
  1319  	/******** math/big ********/
  1320  	alias("math/big", "mulWW", "math/bits", "Mul64", p8...)
  1321  
  1322  	/******** internal/runtime/maps ********/
  1323  
  1324  	// Important: The intrinsic implementations below return a packed
  1325  	// bitset, while the portable Go implementation uses an unpacked
  1326  	// representation (one bit set in each byte).
  1327  	//
  1328  	// Thus we must replace most bitset methods with implementations that
  1329  	// work with the packed representation.
  1330  	//
  1331  	// TODO(prattmic): The bitset implementations don't use SIMD, so they
  1332  	// could be handled with build tags (though that would break
  1333  	// -d=ssa/intrinsics/off=1).
  1334  
  1335  	// With a packed representation we no longer need to shift the result
  1336  	// of TrailingZeros64.
  1337  	alias("internal/runtime/maps", "bitsetFirst", "internal/runtime/sys", "TrailingZeros64", sys.ArchAMD64)
  1338  
  1339  	addF("internal/runtime/maps", "bitsetRemoveBelow",
  1340  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1341  			b := args[0]
  1342  			i := args[1]
  1343  
  1344  			// Clear the lower i bits in b.
  1345  			//
  1346  			// out = b &^ ((1 << i) - 1)
  1347  
  1348  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1349  
  1350  			mask := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT64], one, i)
  1351  			mask = s.newValue2(ssa.OpSub64, types.Types[types.TUINT64], mask, one)
  1352  			mask = s.newValue1(ssa.OpCom64, types.Types[types.TUINT64], mask)
  1353  
  1354  			return s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, mask)
  1355  		},
  1356  		sys.AMD64)
  1357  
  1358  	addF("internal/runtime/maps", "bitsetLowestSet",
  1359  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1360  			b := args[0]
  1361  
  1362  			// Test the lowest bit in b.
  1363  			//
  1364  			// out = (b & 1) == 1
  1365  
  1366  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1367  			and := s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, one)
  1368  			return s.newValue2(ssa.OpEq64, types.Types[types.TBOOL], and, one)
  1369  		},
  1370  		sys.AMD64)
  1371  
  1372  	addF("internal/runtime/maps", "bitsetShiftOutLowest",
  1373  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1374  			b := args[0]
  1375  
  1376  			// Right shift out the lowest bit in b.
  1377  			//
  1378  			// out = b >> 1
  1379  
  1380  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1381  			return s.newValue2(ssa.OpRsh64Ux64, types.Types[types.TUINT64], b, one)
  1382  		},
  1383  		sys.AMD64)
  1384  
  1385  	addF("internal/runtime/maps", "ctrlGroupMatchH2",
  1386  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1387  			g := args[0]
  1388  			h := args[1]
  1389  
  1390  			// Explicit copies to fp registers. See
  1391  			// https://go.dev/issue/70451.
  1392  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1393  			hfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, h)
  1394  
  1395  			// Broadcast h2 into each byte of a word.
  1396  			var broadcast *ssa.Value
  1397  			if buildcfg.GOAMD64 >= 4 {
  1398  				// VPBROADCASTB saves 1 instruction vs PSHUFB
  1399  				// because the input can come from a GP
  1400  				// register, while PSHUFB requires moving into
  1401  				// an FP register first.
  1402  				//
  1403  				// Nominally PSHUFB would require a second
  1404  				// additional instruction to load the control
  1405  				// mask into a FP register. But broadcast uses
  1406  				// a control mask of 0, and the register ABI
  1407  				// already defines X15 as a zero register.
  1408  				broadcast = s.newValue1(ssa.OpAMD64VPBROADCASTB, types.TypeInt128, h) // use gp copy of h
  1409  			} else if buildcfg.GOAMD64 >= 2 {
  1410  				// PSHUFB performs a byte broadcast when given
  1411  				// a control input of 0.
  1412  				broadcast = s.newValue1(ssa.OpAMD64PSHUFBbroadcast, types.TypeInt128, hfp)
  1413  			} else {
  1414  				// No direct byte broadcast. First we must
  1415  				// duplicate the lower byte and then do a
  1416  				// 16-bit broadcast.
  1417  
  1418  				// "Unpack" h2 with itself. This duplicates the
  1419  				// input, resulting in h2 in the lower two
  1420  				// bytes.
  1421  				unpack := s.newValue2(ssa.OpAMD64PUNPCKLBW, types.TypeInt128, hfp, hfp)
  1422  
  1423  				// Copy the lower 16-bits of unpack into every
  1424  				// 16-bit slot in the lower 64-bits of the
  1425  				// output register. Note that immediate 0
  1426  				// selects the low word as the source for every
  1427  				// destination slot.
  1428  				broadcast = s.newValue1I(ssa.OpAMD64PSHUFLW, types.TypeInt128, 0, unpack)
  1429  
  1430  				// No need to broadcast into the upper 64-bits,
  1431  				// as we don't use those.
  1432  			}
  1433  
  1434  			// Compare each byte of the control word with h2. Each
  1435  			// matching byte has every bit set.
  1436  			eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, broadcast, gfp)
  1437  
  1438  			// Construct a "byte mask": each output bit is equal to
  1439  			// the sign bit each input byte.
  1440  			//
  1441  			// This results in a packed output (bit N set means
  1442  			// byte N matched).
  1443  			//
  1444  			// NOTE: See comment above on bitsetFirst.
  1445  			out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], eq)
  1446  
  1447  			// g is only 64-bits so the upper 64-bits of the
  1448  			// 128-bit register will be zero. If h2 is also zero,
  1449  			// then we'll get matches on those bytes. Truncate the
  1450  			// upper bits to ignore such matches.
  1451  			ret := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
  1452  
  1453  			return ret
  1454  		},
  1455  		sys.AMD64)
  1456  
  1457  	addF("internal/runtime/maps", "ctrlGroupMatchEmpty",
  1458  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1459  			// An empty slot is   1000 0000
  1460  			// A deleted slot is  1111 1110
  1461  			// A full slot is     0??? ????
  1462  
  1463  			g := args[0]
  1464  
  1465  			// Explicit copy to fp register. See
  1466  			// https://go.dev/issue/70451.
  1467  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1468  
  1469  			if buildcfg.GOAMD64 >= 2 {
  1470  				// "PSIGNB negates each data element of the
  1471  				// destination operand (the first operand) if
  1472  				// the signed integer value of the
  1473  				// corresponding data element in the source
  1474  				// operand (the second operand) is less than
  1475  				// zero. If the signed integer value of a data
  1476  				// element in the source operand is positive,
  1477  				// the corresponding data element in the
  1478  				// destination operand is unchanged. If a data
  1479  				// element in the source operand is zero, the
  1480  				// corresponding data element in the
  1481  				// destination operand is set to zero" - Intel SDM
  1482  				//
  1483  				// If we pass the group control word as both
  1484  				// arguments:
  1485  				// - Full slots are unchanged.
  1486  				// - Deleted slots are negated, becoming
  1487  				//   0000 0010.
  1488  				// - Empty slots are negated, becoming
  1489  				//   1000 0000 (unchanged!).
  1490  				//
  1491  				// The result is that only empty slots have the
  1492  				// sign bit set. We then use PMOVMSKB to
  1493  				// extract the sign bits.
  1494  				sign := s.newValue2(ssa.OpAMD64PSIGNB, types.TypeInt128, gfp, gfp)
  1495  
  1496  				// Construct a "byte mask": each output bit is
  1497  				// equal to the sign bit each input byte. The
  1498  				// sign bit is only set for empty or deleted
  1499  				// slots.
  1500  				//
  1501  				// This results in a packed output (bit N set
  1502  				// means byte N matched).
  1503  				//
  1504  				// NOTE: See comment above on bitsetFirst.
  1505  				ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], sign)
  1506  
  1507  				// g is only 64-bits so the upper 64-bits of
  1508  				// the 128-bit register will be zero. PSIGNB
  1509  				// will keep all of these bytes zero, so no
  1510  				// need to truncate.
  1511  
  1512  				return ret
  1513  			}
  1514  
  1515  			// No PSIGNB, simply do byte equality with ctrlEmpty.
  1516  
  1517  			// Load ctrlEmpty into each byte of a control word.
  1518  			var ctrlsEmpty uint64 = abi.MapCtrlEmpty
  1519  			e := s.constInt64(types.Types[types.TUINT64], int64(ctrlsEmpty))
  1520  			// Explicit copy to fp register. See
  1521  			// https://go.dev/issue/70451.
  1522  			efp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, e)
  1523  
  1524  			// Compare each byte of the control word with ctrlEmpty. Each
  1525  			// matching byte has every bit set.
  1526  			eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, efp, gfp)
  1527  
  1528  			// Construct a "byte mask": each output bit is equal to
  1529  			// the sign bit each input byte.
  1530  			//
  1531  			// This results in a packed output (bit N set means
  1532  			// byte N matched).
  1533  			//
  1534  			// NOTE: See comment above on bitsetFirst.
  1535  			out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], eq)
  1536  
  1537  			// g is only 64-bits so the upper 64-bits of the
  1538  			// 128-bit register will be zero. The upper 64-bits of
  1539  			// efp are also zero, so we'll get matches on those
  1540  			// bytes. Truncate the upper bits to ignore such
  1541  			// matches.
  1542  			return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
  1543  		},
  1544  		sys.AMD64)
  1545  
  1546  	addF("internal/runtime/maps", "ctrlGroupMatchEmptyOrDeleted",
  1547  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1548  			// An empty slot is   1000 0000
  1549  			// A deleted slot is  1111 1110
  1550  			// A full slot is     0??? ????
  1551  			//
  1552  			// A slot is empty or deleted iff bit 7 (sign bit) is
  1553  			// set.
  1554  
  1555  			g := args[0]
  1556  
  1557  			// Explicit copy to fp register. See
  1558  			// https://go.dev/issue/70451.
  1559  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1560  
  1561  			// Construct a "byte mask": each output bit is equal to
  1562  			// the sign bit each input byte. The sign bit is only
  1563  			// set for empty or deleted slots.
  1564  			//
  1565  			// This results in a packed output (bit N set means
  1566  			// byte N matched).
  1567  			//
  1568  			// NOTE: See comment above on bitsetFirst.
  1569  			ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], gfp)
  1570  
  1571  			// g is only 64-bits so the upper 64-bits of the
  1572  			// 128-bit register will be zero. Zero will never match
  1573  			// ctrlEmpty or ctrlDeleted, so no need to truncate.
  1574  
  1575  			return ret
  1576  		},
  1577  		sys.AMD64)
  1578  
  1579  	addF("internal/runtime/maps", "ctrlGroupMatchFull",
  1580  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1581  			// An empty slot is   1000 0000
  1582  			// A deleted slot is  1111 1110
  1583  			// A full slot is     0??? ????
  1584  			//
  1585  			// A slot is full iff bit 7 (sign bit) is unset.
  1586  
  1587  			g := args[0]
  1588  
  1589  			// Explicit copy to fp register. See
  1590  			// https://go.dev/issue/70451.
  1591  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1592  
  1593  			// Construct a "byte mask": each output bit is equal to
  1594  			// the sign bit each input byte. The sign bit is only
  1595  			// set for empty or deleted slots.
  1596  			//
  1597  			// This results in a packed output (bit N set means
  1598  			// byte N matched).
  1599  			//
  1600  			// NOTE: See comment above on bitsetFirst.
  1601  			mask := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], gfp)
  1602  
  1603  			// Invert the mask to set the bits for the full slots.
  1604  			out := s.newValue1(ssa.OpCom16, types.Types[types.TUINT16], mask)
  1605  
  1606  			// g is only 64-bits so the upper 64-bits of the
  1607  			// 128-bit register will be zero, with bit 7 unset.
  1608  			// Truncate the upper bits to ignore these.
  1609  			return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
  1610  		},
  1611  		sys.AMD64)
  1612  
  1613  	/******** crypto/internal/constanttime ********/
  1614  	// We implement a superset of the Select promise:
  1615  	// Select returns x if v != 0 and y if v == 0.
  1616  	add("crypto/internal/constanttime", "Select",
  1617  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1618  			v, x, y := args[0], args[1], args[2]
  1619  
  1620  			var checkOp ssa.Op
  1621  			var zero *ssa.Value
  1622  			switch s.config.PtrSize {
  1623  			case 8:
  1624  				checkOp = ssa.OpNeq64
  1625  				zero = s.constInt64(types.Types[types.TINT], 0)
  1626  			case 4:
  1627  				checkOp = ssa.OpNeq32
  1628  				zero = s.constInt32(types.Types[types.TINT], 0)
  1629  			default:
  1630  				panic("unreachable")
  1631  			}
  1632  			check := s.newValue2(checkOp, types.Types[types.TBOOL], zero, v)
  1633  
  1634  			return s.newValue3(ssa.OpCondSelect, types.Types[types.TINT], x, y, check)
  1635  		},
  1636  		sys.ArchAMD64, sys.ArchARM64, sys.ArchLoong64, sys.ArchPPC64, sys.ArchPPC64LE, sys.ArchWasm) // all with CMOV support.
  1637  	add("crypto/internal/constanttime", "boolToUint8",
  1638  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1639  			return s.newValue1(ssa.OpCvtBoolToUint8, types.Types[types.TUINT8], args[0])
  1640  		},
  1641  		all...)
  1642  
  1643  	if buildcfg.Experiment.SIMD {
  1644  		// Only enable intrinsics, if SIMD experiment.
  1645  		simdIntrinsics(addF)
  1646  
  1647  		addF(simdPackage, "ClearAVXUpperBits",
  1648  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1649  				s.vars[memVar] = s.newValue1(ssa.OpAMD64VZEROUPPER, types.TypeMem, s.mem())
  1650  				return nil
  1651  			},
  1652  			sys.AMD64)
  1653  
  1654  		addF(simdPackage, "Int8x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1655  		addF(simdPackage, "Int16x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1656  		addF(simdPackage, "Int32x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1657  		addF(simdPackage, "Int64x2.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1658  		addF(simdPackage, "Uint8x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1659  		addF(simdPackage, "Uint16x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1660  		addF(simdPackage, "Uint32x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1661  		addF(simdPackage, "Uint64x2.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1662  		addF(simdPackage, "Int8x32.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1663  		addF(simdPackage, "Int16x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1664  		addF(simdPackage, "Int32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1665  		addF(simdPackage, "Int64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1666  		addF(simdPackage, "Uint8x32.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1667  		addF(simdPackage, "Uint16x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1668  		addF(simdPackage, "Uint32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1669  		addF(simdPackage, "Uint64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1670  		addF(simdPackage, "Float32x4.IsNaN", opLen1(ssa.OpIsNaNFloat32x4, types.TypeVec128), sys.AMD64)
  1671  		addF(simdPackage, "Float32x8.IsNaN", opLen1(ssa.OpIsNaNFloat32x8, types.TypeVec256), sys.AMD64)
  1672  		addF(simdPackage, "Float32x16.IsNaN", opLen1(ssa.OpIsNaNFloat32x16, types.TypeVec512), sys.AMD64)
  1673  		addF(simdPackage, "Float64x2.IsNaN", opLen1(ssa.OpIsNaNFloat64x2, types.TypeVec128), sys.AMD64)
  1674  		addF(simdPackage, "Float64x4.IsNaN", opLen1(ssa.OpIsNaNFloat64x4, types.TypeVec256), sys.AMD64)
  1675  		addF(simdPackage, "Float64x8.IsNaN", opLen1(ssa.OpIsNaNFloat64x8, types.TypeVec512), sys.AMD64)
  1676  
  1677  		// sfp4 is intrinsic-if-constant, but otherwise it's complicated enough to just implement in Go.
  1678  		sfp4 := func(method string, hwop ssa.Op, vectype *types.Type) {
  1679  			addF(simdPackage, method,
  1680  				func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1681  					x, a, b, c, d, y := args[0], args[1], args[2], args[3], args[4], args[5]
  1682  					if a.Op == ssa.OpConst8 && b.Op == ssa.OpConst8 && c.Op == ssa.OpConst8 && d.Op == ssa.OpConst8 {
  1683  						z := select4FromPair(x, a, b, c, d, y, s, hwop, vectype)
  1684  						if z != nil {
  1685  							return z
  1686  						}
  1687  					}
  1688  					return s.callResult(n, callNormal)
  1689  				},
  1690  				sys.AMD64)
  1691  		}
  1692  
  1693  		sfp4("Int32x4.SelectFromPair", ssa.OpconcatSelectedConstantInt32x4, types.TypeVec128)
  1694  		sfp4("Uint32x4.SelectFromPair", ssa.OpconcatSelectedConstantUint32x4, types.TypeVec128)
  1695  		sfp4("Float32x4.SelectFromPair", ssa.OpconcatSelectedConstantFloat32x4, types.TypeVec128)
  1696  
  1697  		sfp4("Int32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt32x8, types.TypeVec256)
  1698  		sfp4("Uint32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x8, types.TypeVec256)
  1699  		sfp4("Float32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x8, types.TypeVec256)
  1700  
  1701  		sfp4("Int32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt32x16, types.TypeVec512)
  1702  		sfp4("Uint32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512)
  1703  		sfp4("Float32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x16, types.TypeVec512)
  1704  
  1705  		// sfp2 is intrinsic-if-constant, but otherwise it's complicated enough to just implement in Go.
  1706  		sfp2 := func(method string, hwop ssa.Op, vectype *types.Type, cscimm func(i, j uint8) int64) {
  1707  			addF(simdPackage, method,
  1708  				func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1709  					x, a, b, y := args[0], args[1], args[2], args[3]
  1710  					if a.Op == ssa.OpConst8 && b.Op == ssa.OpConst8 {
  1711  						z := select2FromPair(x, a, b, y, s, hwop, vectype, cscimm)
  1712  						if z != nil {
  1713  							return z
  1714  						}
  1715  					}
  1716  					return s.callResult(n, callNormal)
  1717  				},
  1718  				sys.AMD64)
  1719  		}
  1720  
  1721  		sfp2("Uint64x2.SelectFromPair", ssa.OpconcatSelectedConstantUint64x2, types.TypeVec128, cscimm2)
  1722  		sfp2("Int64x2.SelectFromPair", ssa.OpconcatSelectedConstantInt64x2, types.TypeVec128, cscimm2)
  1723  		sfp2("Float64x2.SelectFromPair", ssa.OpconcatSelectedConstantFloat64x2, types.TypeVec128, cscimm2)
  1724  
  1725  		sfp2("Uint64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, cscimm2g2)
  1726  		sfp2("Int64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt64x4, types.TypeVec256, cscimm2g2)
  1727  		sfp2("Float64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat64x4, types.TypeVec256, cscimm2g2)
  1728  
  1729  		sfp2("Uint64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, cscimm2g4)
  1730  		sfp2("Int64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt64x8, types.TypeVec512, cscimm2g4)
  1731  		sfp2("Float64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat64x8, types.TypeVec512, cscimm2g4)
  1732  
  1733  	}
  1734  }
  1735  
  1736  func cscimm4(a, b, c, d uint8) int64 {
  1737  	return se(a + b<<2 + c<<4 + d<<6)
  1738  }
  1739  
  1740  func cscimm2(a, b uint8) int64 {
  1741  	return se(a + b<<1)
  1742  }
  1743  
  1744  func cscimm2g2(a, b uint8) int64 {
  1745  	g := cscimm2(a, b)
  1746  	return int64(int8(g + g<<2))
  1747  }
  1748  
  1749  func cscimm2g4(a, b uint8) int64 {
  1750  	g := cscimm2g2(a, b)
  1751  	return int64(int8(g + g<<4))
  1752  }
  1753  
// 4-lane source patterns for select4FromPair. Each letter names the
// source vector for one selected lane, lane 0 first: L means the
// first ("low") vector x, H means the second ("high") vector y. The
// iota values match the 4-bit pattern computed in select4FromPair,
// where bit i set means lane i selects from y.
const (
	_LLLL = iota
	_HLLL
	_LHLL
	_HHLL
	_LLHL
	_HLHL
	_LHHL
	_HHHL
	_LLLH
	_HLLH
	_LHLH
	_HHLH
	_LLHH
	_HLHH
	_LHHH
	_HHHH
)
  1772  
// 2-lane source patterns for select2FromPair, lane 0 first: L means
// the first vector x, H means the second vector y. The iota values
// match the 2-bit pattern computed in select2FromPair (bit i set
// means lane i selects from y).
const (
	_LL = iota
	_HL
	_LH
	_HH
)
  1779  
  1780  func select2FromPair(x, _a, _b, y *ssa.Value, s *state, op ssa.Op, t *types.Type, csc func(a, b uint8) int64) *ssa.Value {
  1781  	a, b := uint8(_a.AuxInt8()), uint8(_b.AuxInt8())
  1782  	if a > 3 || b > 3 {
  1783  		return nil
  1784  	}
  1785  	pattern := (a&2)>>1 + (b & 2)
  1786  	a, b = a&1, b&1
  1787  
  1788  	switch pattern {
  1789  	case _LL:
  1790  		return s.newValue2I(op, t, csc(a, b), x, x)
  1791  	case _HH:
  1792  		return s.newValue2I(op, t, csc(a, b), y, y)
  1793  	case _LH:
  1794  		return s.newValue2I(op, t, csc(a, b), x, y)
  1795  	case _HL:
  1796  		return s.newValue2I(op, t, csc(a, b), y, x)
  1797  	}
  1798  	panic("The preceding switch should have been exhaustive")
  1799  }
  1800  
// select4FromPair lowers a constant-selector 4-lane select over the
// vector pair (x, y) to one or two shuffle-style SSA ops. _a.._d are
// the constant lane selectors; values 0-3 pick from x, 4-7 pick from
// y. Returns nil when a selector is out of range so the caller can
// fall back to the non-intrinsic call.
func select4FromPair(x, _a, _b, _c, _d, y *ssa.Value, s *state, op ssa.Op, t *types.Type) *ssa.Value {
	a, b, c, d := uint8(_a.AuxInt8()), uint8(_b.AuxInt8()), uint8(_c.AuxInt8()), uint8(_d.AuxInt8())
	if a > 7 || b > 7 || c > 7 || d > 7 {
		return nil
	}
	// One bit per lane: bit i is set when lane i selects from y.
	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1

	// Reduce each selector to its index within its source vector.
	a, b, c, d = a&3, b&3, c&3, d&3

	switch pattern {
	// Uniform cases: both op inputs come from the same source (or the
	// two-lanes-each split that the op supports directly), so a single
	// instruction suffices.
	case _LLLL:
		// TODO DETECT 0,1,2,3, 0,0,0,0
		return s.newValue2I(op, t, cscimm4(a, b, c, d), x, x)
	case _HHHH:
		// TODO DETECT 0,1,2,3, 0,0,0,0
		return s.newValue2I(op, t, cscimm4(a, b, c, d), y, y)
	case _LLHH:
		return s.newValue2I(op, t, cscimm4(a, b, c, d), x, y)
	case _HHLL:
		return s.newValue2I(op, t, cscimm4(a, b, c, d), y, x)

	// Three lanes from one source, lane 0 or 1 from the other: first
	// gather the two lanes a and b (duplicated) into z, then combine z
	// with the remaining source, picking z's lanes 0 and 2.
	case _HLLL:
		z := s.newValue2I(op, t, cscimm4(a, a, b, b), y, x)
		return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, x)
	case _LHLL:
		z := s.newValue2I(op, t, cscimm4(a, a, b, b), x, y)
		return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, x)
	case _HLHH:
		z := s.newValue2I(op, t, cscimm4(a, a, b, b), y, x)
		return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, y)
	case _LHHH:
		z := s.newValue2I(op, t, cscimm4(a, a, b, b), x, y)
		return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, y)

	// Mirror image: lane 2 or 3 is the odd one out, so gather lanes c
	// and d into z and combine on the other side.
	case _LLLH:
		z := s.newValue2I(op, t, cscimm4(c, c, d, d), x, y)
		return s.newValue2I(op, t, cscimm4(a, b, 0, 2), x, z)
	case _LLHL:
		z := s.newValue2I(op, t, cscimm4(c, c, d, d), y, x)
		return s.newValue2I(op, t, cscimm4(a, b, 0, 2), x, z)

	case _HHLH:
		z := s.newValue2I(op, t, cscimm4(c, c, d, d), x, y)
		return s.newValue2I(op, t, cscimm4(a, b, 0, 2), y, z)

	case _HHHL:
		z := s.newValue2I(op, t, cscimm4(c, c, d, d), y, x)
		return s.newValue2I(op, t, cscimm4(a, b, 0, 2), y, z)

	// Alternating cases: gather all four lanes into z in a fixed
	// intermediate order, then permute z alone into the requested
	// order with a constant immediate.
	case _LHLH:
		z := s.newValue2I(op, t, cscimm4(a, c, b, d), x, y)
		return s.newValue2I(op, t, se(0b11_01_10_00), z, z)
	case _HLHL:
		z := s.newValue2I(op, t, cscimm4(b, d, a, c), x, y)
		return s.newValue2I(op, t, se(0b01_11_00_10), z, z)
	case _HLLH:
		z := s.newValue2I(op, t, cscimm4(b, c, a, d), x, y)
		return s.newValue2I(op, t, se(0b11_01_00_10), z, z)
	case _LHHL:
		z := s.newValue2I(op, t, cscimm4(a, d, b, c), x, y)
		return s.newValue2I(op, t, se(0b01_11_10_00), z, z)
	}
	panic("The preceding switch should have been exhaustive")
}
  1865  
  1866  // se smears the not-really-a-sign bit of a uint8 to conform to the conventions
  1867  // for representing AuxInt in ssa.
  1868  func se(x uint8) int64 {
  1869  	return int64(int8(x))
  1870  }
  1871  
  1872  func opLen1(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1873  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1874  		return s.newValue1(op, t, args[0])
  1875  	}
  1876  }
  1877  
  1878  func opLen2(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1879  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1880  		return s.newValue2(op, t, args[0], args[1])
  1881  	}
  1882  }
  1883  
  1884  func opLen2_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1885  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1886  		return s.newValue2(op, t, args[1], args[0])
  1887  	}
  1888  }
  1889  
  1890  func opLen3(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1891  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1892  		return s.newValue3(op, t, args[0], args[1], args[2])
  1893  	}
  1894  }
  1895  
// ssaVecBySize maps a SIMD operand size in bytes to the corresponding
// SSA vector type.
var ssaVecBySize = map[int64]*types.Type{
	16: types.TypeVec128,
	32: types.TypeVec256,
	64: types.TypeVec512,
}
  1901  
  1902  func opLen3_31Zero3(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1903  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1904  		if t, ok := ssaVecBySize[args[1].Type.Size()]; !ok {
  1905  			panic("unknown simd vector size")
  1906  		} else {
  1907  			return s.newValue3(op, t, s.newValue0(ssa.OpZeroSIMD, t), args[1], args[0])
  1908  		}
  1909  	}
  1910  }
  1911  
  1912  func opLen3_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1913  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1914  		return s.newValue3(op, t, args[1], args[0], args[2])
  1915  	}
  1916  }
  1917  
  1918  func opLen3_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1919  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1920  		return s.newValue3(op, t, args[2], args[0], args[1])
  1921  	}
  1922  }
  1923  
  1924  func opLen4(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1925  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1926  		return s.newValue4(op, t, args[0], args[1], args[2], args[3])
  1927  	}
  1928  }
  1929  
  1930  func opLen4_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1931  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1932  		return s.newValue4(op, t, args[2], args[0], args[1], args[3])
  1933  	}
  1934  }
  1935  
  1936  func opLen4_31(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1937  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1938  		return s.newValue4(op, t, args[2], args[1], args[0], args[3])
  1939  	}
  1940  }
  1941  
// immJumpTable lowers an intrinsic whose immediate operand idx is not a
// compile-time constant. It builds a 256-way jump table keyed on the uint8
// value of idx; genOp is invoked once per possible immediate value (0-255)
// to emit that case's code, which is expected to record its result in
// s.vars for the call node. All non-exit cases rejoin at a common block,
// where the merged result is read back with s.variable and returned.
func immJumpTable(s *state, idx *ssa.Value, intrinsicCall *ir.CallExpr, genOp func(*state, int)) *ssa.Value {
	// Make blocks we'll need.
	bEnd := s.f.NewBlock(ssa.BlockPlain)

	if !idx.Type.IsKind(types.TUINT8) {
		panic("immJumpTable expects uint8 value")
	}

	// We will exhaust 0-255, so no need to check the bounds.
	t := types.Types[types.TUINTPTR]
	idx = s.conv(nil, idx, idx.Type, t)

	// Turn the current block into the jump-table dispatch block.
	b := s.curBlock
	b.Kind = ssa.BlockJumpTable
	b.Pos = intrinsicCall.Pos()
	if base.Flag.Cfg.SpectreIndex {
		// Potential Spectre vulnerability hardening?
		idx = s.newValue2(ssa.OpSpectreSliceIndex, t, idx, s.uintptrConstant(255))
	}
	b.SetControl(idx)
	// One target block per possible immediate value, wired as successive
	// successors of the dispatch block.
	targets := [256]*ssa.Block{}
	for i := range 256 {
		t := s.f.NewBlock(ssa.BlockPlain)
		targets[i] = t
		b.AddEdgeTo(t)
	}
	s.endBlock()

	// Emit each case body; cases that end in an exit block (e.g. a panic)
	// do not get an edge to the join block.
	for i, t := range targets {
		s.startBlock(t)
		genOp(s, i)
		if t.Kind != ssa.BlockExit {
			t.AddEdgeTo(bEnd)
		}
		s.endBlock()
	}

	// Merge the per-case results recorded in s.vars at the join block.
	s.startBlock(bEnd)
	ret := s.variable(intrinsicCall, intrinsicCall.Type())
	return ret
}
  1983  
  1984  func opLen1Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1985  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1986  		if args[1].Op == ssa.OpConst8 {
  1987  			return s.newValue1I(op, t, args[1].AuxInt<<int64(offset), args[0])
  1988  		}
  1989  		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
  1990  			// Encode as int8 due to requirement of AuxInt, check its comment for details.
  1991  			s.vars[n] = sNew.newValue1I(op, t, int64(int8(idx<<offset)), args[0])
  1992  		})
  1993  	}
  1994  }
  1995  
  1996  func opLen2Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1997  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1998  		if args[1].Op == ssa.OpConst8 {
  1999  			return s.newValue2I(op, t, args[1].AuxInt<<int64(offset), args[0], args[2])
  2000  		}
  2001  		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
  2002  			// Encode as int8 due to requirement of AuxInt, check its comment for details.
  2003  			s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx<<offset)), args[0], args[2])
  2004  		})
  2005  	}
  2006  }
  2007  
  2008  func opLen3Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2009  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2010  		if args[1].Op == ssa.OpConst8 {
  2011  			return s.newValue3I(op, t, args[1].AuxInt<<int64(offset), args[0], args[2], args[3])
  2012  		}
  2013  		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
  2014  			// Encode as int8 due to requirement of AuxInt, check its comment for details.
  2015  			s.vars[n] = sNew.newValue3I(op, t, int64(int8(idx<<offset)), args[0], args[2], args[3])
  2016  		})
  2017  	}
  2018  }
  2019  
  2020  func opLen2Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2021  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2022  		if args[2].Op == ssa.OpConst8 {
  2023  			return s.newValue2I(op, t, args[2].AuxInt<<int64(offset), args[0], args[1])
  2024  		}
  2025  		return immJumpTable(s, args[2], n, func(sNew *state, idx int) {
  2026  			// Encode as int8 due to requirement of AuxInt, check its comment for details.
  2027  			s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx<<offset)), args[0], args[1])
  2028  		})
  2029  	}
  2030  }
  2031  
// Two immediates instead of just 1.  Offset is ignored, so it is a _ parameter instead.
// The two immediates (args[1] and args[2]) are packed into a single AuxInt
// byte as lo | hi<<4; each must fit in two bits (0-3), otherwise the
// emitted code panics at run time.
func opLen2Imm8_II(op ssa.Op, t *types.Type, _ int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		// Fast path: both immediates are compile-time constants already
		// within the 0-3 range.
		if args[1].Op == ssa.OpConst8 && args[2].Op == ssa.OpConst8 && args[1].AuxInt & ^3 == 0 && args[2].AuxInt & ^3 == 0 {
			i1, i2 := args[1].AuxInt, args[2].AuxInt
			return s.newValue2I(op, t, int64(int8(i1+i2<<4)), args[0], args[3])
		}
		// Otherwise pack the two immediates into one byte
		// (args[1] + args[2]<<4) and dispatch through a jump table.
		four := s.constInt64(types.Types[types.TUINT8], 4)
		shifted := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT8], args[2], four)
		combined := s.newValue2(ssa.OpAdd8, types.Types[types.TUINT8], args[1], shifted)
		return immJumpTable(s, combined, n, func(sNew *state, idx int) {
			// Encode as int8 due to requirement of AuxInt, check its comment for details.
			// TODO for "zeroing" values, panic instead.
			if idx & ^(3+3<<4) == 0 {
				// Both packed nibbles are within range 0-3: emit the op.
				s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx)), args[0], args[3])
			} else {
				// An immediate is out of range: panic at run time.
				sNew.rtcall(ir.Syms.PanicSimdImm, false, nil)
			}
		})
	}
}
  2053  
// The assembler requires the imm value of a SHA1RNDS4 instruction to be one of 0,1,2,3...
// ...so the shifted immediate is masked with 0b11 to keep it in that range.
func opLen2Imm8_SHA1RNDS4(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		if args[1].Op == ssa.OpConst8 {
			// Constant immediate: shift, then clamp to the low two bits.
			return s.newValue2I(op, t, (args[1].AuxInt<<int64(offset))&0b11, args[0], args[2])
		}
		// Non-constant immediate: dispatch over all 256 possible values,
		// clamping each to the low two bits.
		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
			// Encode as int8 due to requirement of AuxInt, check its comment for details.
			s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx<<offset))&0b11, args[0], args[2])
		})
	}
}
  2066  
  2067  func opLen3Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2068  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2069  		if args[2].Op == ssa.OpConst8 {
  2070  			return s.newValue3I(op, t, args[2].AuxInt<<int64(offset), args[0], args[1], args[3])
  2071  		}
  2072  		return immJumpTable(s, args[2], n, func(sNew *state, idx int) {
  2073  			// Encode as int8 due to requirement of AuxInt, check its comment for details.
  2074  			s.vars[n] = sNew.newValue3I(op, t, int64(int8(idx<<offset)), args[0], args[1], args[3])
  2075  		})
  2076  	}
  2077  }
  2078  
  2079  func opLen4Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2080  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2081  		if args[1].Op == ssa.OpConst8 {
  2082  			return s.newValue4I(op, t, args[1].AuxInt<<int64(offset), args[0], args[2], args[3], args[4])
  2083  		}
  2084  		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
  2085  			// Encode as int8 due to requirement of AuxInt, check its comment for details.
  2086  			s.vars[n] = sNew.newValue4I(op, t, int64(int8(idx<<offset)), args[0], args[2], args[3], args[4])
  2087  		})
  2088  	}
  2089  }
  2090  
  2091  func simdLoad() func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2092  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2093  		return s.newValue2(ssa.OpLoad, n.Type(), args[0], s.mem())
  2094  	}
  2095  }
  2096  
  2097  func simdStore() func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2098  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2099  		s.store(args[0].Type, args[1], args[0])
  2100  		return nil
  2101  	}
  2102  }
  2103  
// cvtVToMaskOpcodes maps element bit width, then lane count, to the SSA op
// that converts an integer bitmap value into the corresponding mask type.
var cvtVToMaskOpcodes = map[int]map[int]ssa.Op{
	8:  {16: ssa.OpCvt16toMask8x16, 32: ssa.OpCvt32toMask8x32, 64: ssa.OpCvt64toMask8x64},
	16: {8: ssa.OpCvt8toMask16x8, 16: ssa.OpCvt16toMask16x16, 32: ssa.OpCvt32toMask16x32},
	32: {4: ssa.OpCvt8toMask32x4, 8: ssa.OpCvt8toMask32x8, 16: ssa.OpCvt16toMask32x16},
	64: {2: ssa.OpCvt8toMask64x2, 4: ssa.OpCvt8toMask64x4, 8: ssa.OpCvt8toMask64x8},
}
  2110  
// cvtMaskToVOpcodes maps element bit width, then lane count, to the SSA op
// that converts a mask into an integer bitmap value; the inverse of
// cvtVToMaskOpcodes.
var cvtMaskToVOpcodes = map[int]map[int]ssa.Op{
	8:  {16: ssa.OpCvtMask8x16to16, 32: ssa.OpCvtMask8x32to32, 64: ssa.OpCvtMask8x64to64},
	16: {8: ssa.OpCvtMask16x8to8, 16: ssa.OpCvtMask16x16to16, 32: ssa.OpCvtMask16x32to32},
	32: {4: ssa.OpCvtMask32x4to8, 8: ssa.OpCvtMask32x8to8, 16: ssa.OpCvtMask32x16to16},
	64: {2: ssa.OpCvtMask64x2to8, 4: ssa.OpCvtMask64x4to8, 8: ssa.OpCvtMask64x8to8},
}
  2117  
  2118  func simdCvtVToMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2119  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2120  		op := cvtVToMaskOpcodes[elemBits][lanes]
  2121  		if op == 0 {
  2122  			panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
  2123  		}
  2124  		return s.newValue1(op, types.TypeMask, args[0])
  2125  	}
  2126  }
  2127  
  2128  func simdCvtMaskToV(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2129  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2130  		op := cvtMaskToVOpcodes[elemBits][lanes]
  2131  		if op == 0 {
  2132  			panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
  2133  		}
  2134  		return s.newValue1(op, n.Type(), args[0])
  2135  	}
  2136  }
  2137  
  2138  func simdMaskedLoad(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2139  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2140  		return s.newValue3(op, n.Type(), args[0], args[1], s.mem())
  2141  	}
  2142  }
  2143  
  2144  func simdMaskedStore(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2145  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2146  		s.vars[memVar] = s.newValue4A(op, types.TypeMem, args[0].Type, args[1], args[2], args[0], s.mem())
  2147  		return nil
  2148  	}
  2149  }
  2150  
  2151  // findIntrinsic returns a function which builds the SSA equivalent of the
  2152  // function identified by the symbol sym.  If sym is not an intrinsic call, returns nil.
  2153  func findIntrinsic(sym *types.Sym) intrinsicBuilder {
  2154  	if sym == nil || sym.Pkg == nil {
  2155  		return nil
  2156  	}
  2157  	pkg := sym.Pkg.Path
  2158  	if sym.Pkg == ir.Pkgs.Runtime {
  2159  		pkg = "runtime"
  2160  	}
  2161  	if base.Flag.Race && pkg == "sync/atomic" {
  2162  		// The race detector needs to be able to intercept these calls.
  2163  		// We can't intrinsify them.
  2164  		return nil
  2165  	}
  2166  	// Skip intrinsifying math functions (which may contain hard-float
  2167  	// instructions) when soft-float
  2168  	if Arch.SoftFloat && pkg == "math" {
  2169  		return nil
  2170  	}
  2171  
  2172  	fn := sym.Name
  2173  	if ssa.IntrinsicsDisable {
  2174  		if pkg == "internal/runtime/sys" && (fn == "GetCallerPC" || fn == "GrtCallerSP" || fn == "GetClosurePtr") ||
  2175  			pkg == simdPackage {
  2176  			// These runtime functions don't have definitions, must be intrinsics.
  2177  		} else {
  2178  			return nil
  2179  		}
  2180  	}
  2181  	return intrinsics.lookup(Arch.LinkArch.Arch, pkg, fn)
  2182  }
  2183  
  2184  func IsIntrinsicCall(n *ir.CallExpr) bool {
  2185  	if n == nil {
  2186  		return false
  2187  	}
  2188  	name, ok := n.Fun.(*ir.Name)
  2189  	if !ok {
  2190  		if n.Fun.Op() == ir.OMETHEXPR {
  2191  			if meth := ir.MethodExprName(n.Fun); meth != nil {
  2192  				if fn := meth.Func; fn != nil {
  2193  					return IsIntrinsicSym(fn.Sym())
  2194  				}
  2195  			}
  2196  		}
  2197  		return false
  2198  	}
  2199  	return IsIntrinsicSym(name.Sym())
  2200  }
  2201  
  2202  func IsIntrinsicSym(sym *types.Sym) bool {
  2203  	return findIntrinsic(sym) != nil
  2204  }
  2205  
// GenIntrinsicBody generates the function body for a bodyless intrinsic.
// This is used when the intrinsic is used in a non-call context, e.g.
// as a function pointer, or (for a method) being referenced from the type
// descriptor.
//
// The compiler already recognizes a call to fn as an intrinsic and can
// directly generate code for it. So we just fill in the body with a call
// to fn.
func GenIntrinsicBody(fn *ir.Func) {
	// Must not be called while another function is being compiled.
	if ir.CurFunc != nil {
		base.FatalfAt(fn.Pos(), "enqueueFunc %v inside %v", fn, ir.CurFunc)
	}

	if base.Flag.LowerR != 0 {
		fmt.Println("generate intrinsic for", ir.FuncName(fn))
	}

	pos := fn.Pos()
	ft := fn.Type()
	var ret ir.Node

	// For a method, it usually starts with an ODOTMETH (pre-typecheck) or
	// OMETHEXPR (post-typecheck) referencing the method symbol without the
	// receiver type, and Walk rewrites it to a call directly to the
	// type-qualified method symbol, moving the receiver to an argument.
	// Here fn has already the type-qualified method symbol, and it is hard
	// to get the unqualified symbol. So we just generate the post-Walk form
	// and mark it typechecked and Walked.
	call := ir.NewCallExpr(pos, ir.OCALLFUNC, fn.Nname, nil)
	call.Args = ir.RecvParamNames(ft)
	call.IsDDD = ft.IsVariadic()
	typecheck.Exprs(call.Args)
	call.SetTypecheck(1)
	call.SetWalked(true)
	ret = call
	// If fn returns results, wrap the call in a return statement; the
	// call's type is the single result type or the results tuple.
	if ft.NumResults() > 0 {
		if ft.NumResults() == 1 {
			call.SetType(ft.Result(0).Type)
		} else {
			call.SetType(ft.ResultsTuple())
		}
		n := ir.NewReturnStmt(base.Pos, nil)
		n.Results = []ir.Node{call}
		ret = n
	}
	fn.Body.Append(ret)

	if base.Flag.LowerR != 0 {
		ir.DumpList("generate intrinsic body", fn.Body)
	}

	// Typecheck the new body with fn as the current function.
	ir.CurFunc = fn
	typecheck.Stmts(fn.Body)
	ir.CurFunc = nil // we know CurFunc is nil at entry
}
  2261  

View as plain text