// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ignore

// mkpreempt generates the asyncPreempt functions for each
// architecture.
package main

import (
	"bytes"
	"flag"
	"fmt"
	"go/format"
	"io"
	"log"
	"os"
	"strings"
)

// Copied from cmd/compile/internal/ssa/gen/*Ops.go

// regNames386 lists the 386 register names: the eight 32-bit integer
// registers followed by X0-X7.
var regNames386 = []string{
	// Integer registers.
	"AX", "CX", "DX", "BX", "SP", "BP", "SI", "DI",
	// X registers.
	"X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7",
}

// regNamesAMD64 lists the amd64 register names: the sixteen 64-bit integer
// registers followed by X0-X15.
var regNamesAMD64 = []string{
	// Integer registers.
	"AX", "CX", "DX", "BX", "SP", "BP", "SI", "DI",
	"R8", "R9", "R10", "R11", "R12", "R13", "R14", "R15",
	// X registers.
	"X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7",
	"X8", "X9", "X10", "X11", "X12", "X13", "X14", "X15",
}

// arches maps each architecture key accepted on the command line to the
// function that emits that architecture's asyncPreempt body.
var arches = map[string]func(*gen){
	"386":     gen386,
	"amd64":   genAMD64,
	"arm":     genARM,
	"arm64":   genARM64,
	"loong64": genLoong64,
	// Both MIPS keys share one generator; the flag selects the 64-bit form.
	"mips64x": func(out *gen) { genMIPS(out, true) },
	"mipsx":   func(out *gen) { genMIPS(out, false) },
	"ppc64x":  genPPC64,
	"riscv64": genRISCV64,
	"s390x":   genS390X,
	"wasm":    genWasm,
}
// beLe records the architecture keys that stand for a pair of GOARCH values,
// <base> and <base>le. commonHeader emits a //go:build line for these keys.
var beLe = map[string]bool{
	"mips64x": true,
	"mipsx":   true,
	"ppc64x":  true,
}

// main generates the asyncPreempt assembly.
//
// When architecture names are given on the command line, the assembly for
// each is written to standard output in the order given; an unknown name is
// fatal. With no arguments, preempt_<arch>.s is created in the current
// directory for every entry in arches.
func main() {
	flag.Parse()

	if requested := flag.Args(); len(requested) > 0 {
		for _, name := range requested {
			emit, known := arches[name]
			if !known {
				log.Fatalf("unknown arch %s", name)
			}
			out := gen{os.Stdout, name}
			out.asmHeader()
			emit(&out)
		}
		return
	}

	for name, emit := range arches {
		file, err := os.Create(fmt.Sprintf("preempt_%s.s", name))
		if err != nil {
			log.Fatal(err)
		}
		out := gen{file, name}
		out.asmHeader()
		emit(&out)
		if err := file.Close(); err != nil {
			log.Fatal(err)
		}
	}
}

// gen carries the output destination and target architecture for one
// generated file. It is constructed positionally elsewhere in this file, so
// the field order must not change.
type gen struct {
	w      io.Writer // destination for the generated text
	goarch string    // architecture key, as used in arches (e.g. "amd64", "mips64x")
}

// commonHeader writes the "Code generated" banner and, when g.goarch is
// listed in beLe, a //go:build constraint naming the base architecture and
// its "le" variant.
func (g *gen) commonHeader() {
	fmt.Fprint(g.w, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n")
	if !beLe[g.goarch] {
		return
	}
	// Drop the last character of the key (the "x" of e.g. "mips64x") to get
	// the base architecture name.
	base := g.goarch[:len(g.goarch)-1]
	fmt.Fprintf(g.w, "//go:build %[1]s || %[1]sle\n\n", base)
}

// asmHeader writes the preamble of an assembly file: the common header, the
// #include lines (amd64 gets two extra headers), and the TEXT directive that
// opens asyncPreempt.
func (g *gen) asmHeader() {
	g.commonHeader()

	headers := []string{"go_asm.h"}
	if g.goarch == "amd64" {
		headers = append(headers, "go_tls.h", "asm_amd64.h")
	}
	headers = append(headers, "textflag.h")
	for _, h := range headers {
		fmt.Fprintf(g.w, "#include \"%s\"\n", h)
	}

	// A blank line separates the includes from the function.
	fmt.Fprintf(g.w, "\nTEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n")
}

// p formats f with args and writes the result as tab-indented output.
// Every line of a multi-line result is indented, and a trailing newline is
// added.
func (g *gen) p(f string, args ...any) {
	text := fmt.Sprintf(f, args...)
	indented := strings.Join(strings.Split(text, "\n"), "\n\t")
	fmt.Fprintln(g.w, "\t"+indented)
}

// label writes l on a line of its own with no indentation. Callers pass the
// full label text, including the trailing colon.
func (g *gen) label(l string) {
	fmt.Fprintln(g.w, l)
}

// writeXRegs writes an architecture xregs file.
func writeXRegs(arch string, l *layout) {
	var code bytes.Buffer
	g := gen{&code, arch}
	g.commonHeader()
	fmt.Fprintf(g.w, `
package runtime

type xRegs struct {
`)
	pos := 0
	for _, seq := range l.regs {
		for _, r := range seq.regs {
			if r.pos != pos && !seq.fixedOffset {
				log.Fatalf("padding not implemented")
			}
			typ := fmt.Sprintf("[%d]byte", r.size)
			switch {
			case r.size == 4 && r.pos%4 == 0:
				typ = "uint32"
			case r.size == 8 && r.pos%8 == 0:
				typ = "uint64"
			}
			fmt.Fprintf(g.w, "\t%s %s\n", r.name, typ)
			pos += r.size
		}
	}
	fmt.Fprintf(g.w, "}\n")

	path := fmt.Sprintf("preempt_%s.go", arch)
	b, err := format.Source(code.Bytes())
	if err != nil {
		log.Fatalf("formatting %s: %s", path, err)
	}
	if err := os.WriteFile(path, b, 0666); err != nil {
		log.Fatal(err)
	}
}

// layout describes a group of registers to save and restore, together with
// the offset of each one from a base register.
type layout struct {
	// stack is the offset at which the next register will be placed. Once
	// all registers have been added it is the total size of the layout.
	stack int
	// regs holds the register sequences in the order they were added, which
	// is also the order in which save emits them.
	regs []regSeq
	// sp is the base register the offsets are relative to. It is the stack
	// pointer for stack frames, but some layouts in this file use another
	// register that points at the extended register state instead.
	sp string // stack pointer register
}

// regInfo describes one register within a regSeq. Values are built
// positionally by add and addSpecial, so the field order must not change.
type regInfo struct {
	size int    // register size in bytes
	name string // register name

	// Some register names may require a specific suffix.
	// In ARM64, a suffix called an "arrangement specifier" can be added to
	// a register name. For example:
	//
	//	V0.B16
	//
	// In this case, "V0" is the register name, and ".B16" is the suffix.
	//
	// The suffix is only used when a sequence has more than one register;
	// see regSeq.String.
	suffix string

	pos int // position on stack
}

// A regSeq is one or more registers that are saved, and later restored, by a
// single emitted operation.
//
// Some save/restore operations can involve multiple registers in a single
// instruction. For example, the LDP/STP instructions in ARM64:
//
//	LDP 8(RSP), (R0, R1)
//	STP (R0, R1), 8(RSP)
//
// In these cases, a pair of registers (R0, R1) is used as a single argument.
type regSeq struct {
	saveOp    string    // instruction mnemonic used to save the sequence
	restoreOp string    // instruction mnemonic used to restore the sequence
	regs      []regInfo // registers in the sequence; must not be empty

	// By default, all registers are saved on the stack, and the stack pointer offset
	// is calculated based on the size of each register. For example (ARM64):
	//
	//   STP (R0, R1), 8(RSP)
	//   STP (R2, R3), 24(RSP)
	//
	// However, automatic offset calculation may not always be desirable.
	// In some cases, the offset must remain fixed:
	//
	//   VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R0)
	//   VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
	//
	// In this example, R0 is post-incremented after each instruction,
	// so the offset should not be recalculated. For such cases,
	// `fixedOffset` is set to true.
	fixedOffset bool

	// After conversion to a string, register names are separated by commas
	// and may be wrapped in a custom pair of brackets. For example (ARM64):
	//
	//   (R0, R1) // wrapped in parentheses
	//   [V0.B16, V1.B16, V2.B16, V3.B16] // wrapped in square brackets
	brackets [2]string

	// If this register requires special save and restore, these
	// give those operations with a %d placeholder for the stack
	// offset. When set, they are used instead of saveOp/restoreOp.
	save, restore string
}

// add appends a single register named regname that is both saved and
// restored with instruction op. The register is placed at the current end of
// the layout, which then grows by size bytes.
func (l *layout) add(op, regname string, size int) {
	reg := regInfo{size: size, name: regname, pos: l.stack}
	seq := regSeq{saveOp: op, restoreOp: op, regs: []regInfo{reg}}
	l.regs = append(l.regs, seq)
	l.stack += size
}

// add2 appends a sequence of registers saved with sop and restored with rop
// as a single operation. brackets wraps the register list when it is
// rendered.
//
// Unless fixedOffset is set, each register is assigned the next free
// position and the layout grows by its size. With fixedOffset the caller's
// positions are kept and the layout size is unchanged. The regs slice is
// stored as given, and its elements are updated in place.
func (l *layout) add2(sop, rop string, regs []regInfo, brackets [2]string, fixedOffset bool) {
	if !fixedOffset {
		for i := range regs {
			r := &regs[i]
			r.pos = l.stack
			l.stack += r.size
		}
	}
	l.regs = append(l.regs, regSeq{
		saveOp:      sop,
		restoreOp:   rop,
		regs:        regs,
		brackets:    brackets,
		fixedOffset: fixedOffset,
	})
}

// addSpecial appends an unnamed slot of size bytes whose save and restore
// code is given explicitly. Both strings are format strings with a %d
// placeholder for the slot's offset.
func (l *layout) addSpecial(save, restore string, size int) {
	slot := regInfo{size: size, pos: l.stack}
	seq := regSeq{save: save, restore: restore, regs: []regInfo{slot}}
	l.regs = append(l.regs, seq)
	l.stack += size
}

// String renders the sequence as an instruction operand. A single register
// is rendered as its bare name, without suffix or brackets. Several
// registers are rendered as name+suffix, separated by ", " and wrapped in
// rs.brackets. An empty sequence is fatal.
func (rs *regSeq) String() string {
	n := len(rs.regs)
	if n == 0 {
		log.Fatal("Register sequence must not be empty!")
		return ""
	}
	if n == 1 {
		return rs.regs[0].name
	}

	parts := make([]string, n)
	for i := range rs.regs {
		parts[i] = rs.regs[i].name + rs.regs[i].suffix
	}
	return rs.brackets[0] + strings.Join(parts, ", ") + rs.brackets[1]
}

// save emits the instructions that store every register sequence of l, in
// the order the sequences were added.
//
// Only the position of the first register in a sequence appears in the
// output. For a multi-register instruction such as
//
//	STP (R0, R1), 8(RSP)
//
// the instruction itself places the following registers.
func (l *layout) save(g *gen) {
	for i := range l.regs {
		seq := &l.regs[i]
		if len(seq.regs) < 1 {
			log.Fatal("Register sequence must not be empty!")
		}
		offset := seq.regs[0].pos
		if seq.save != "" {
			// Special save code takes the offset as its only argument.
			g.p(seq.save, offset)
			continue
		}
		g.p("%s %s, %d(%s)", seq.saveOp, seq.String(), offset, l.sp)
	}
}

// restoreInOrder emits the instructions that reload every register sequence
// of l. With reverse set the sequences are visited last-to-first; otherwise
// they are visited in the order they were added. As in save, only the
// position of the first register of each sequence is used.
func (l *layout) restoreInOrder(g *gen, reverse bool) {
	n := len(l.regs)
	for k := range n {
		idx := k
		if reverse {
			idx = n - 1 - k
		}
		seq := &l.regs[idx]
		if len(seq.regs) < 1 {
			log.Fatal("Register sequence must not be empty!")
		}
		offset := seq.regs[0].pos
		if seq.restore != "" {
			// Special restore code takes the offset as its only argument.
			g.p(seq.restore, offset)
			continue
		}
		g.p("%s %d(%s), %s", seq.restoreOp, offset, l.sp, seq.String())
	}
}

// restore emits the restore instructions for l in reverse order, i.e. the
// sequence added last is restored first.
func (l *layout) restore(g *gen) {
	l.restoreInOrder(g, true)
}

// restoreDirect emits the restore instructions for l in the same order in
// which the sequences were added (and saved).
func (l *layout) restoreDirect(g *gen) {
	l.restoreInOrder(g, false)
}

// gen386 emits the body of asyncPreempt for 386.
//
// The integer registers from regNames386 (all but SP) are always saved.
// X0-X7 are laid out directly after them, but their save and restore
// instructions are wrapped in #ifndef GO386_softfloat, so they are only
// assembled when that macro is not defined.
func gen386(g *gen) {
	p := g.p

	// Push the flags first; the matching POPFL is emitted just before RET.
	p("PUSHFL")
	// Save general purpose registers.
	var l = layout{sp: "SP"}
	for _, reg := range regNames386 {
		if reg == "SP" || strings.HasPrefix(reg, "X") {
			continue
		}
		l.add("MOVL", reg, 4)
	}

	softfloat := "GO386_softfloat"

	// Save SSE state only if supported.
	// lSSE starts at the offset where l ends, so lSSE.stack is the size of
	// the whole frame (integer registers plus X registers).
	lSSE := layout{stack: l.stack, sp: "SP"}
	for i := 0; i < 8; i++ {
		lSSE.add("MOVUPS", fmt.Sprintf("X%d", i), 16)
	}

	// The frame is always sized for both layouts, whether or not the X
	// registers end up being saved.
	p("ADJSP $%d", lSSE.stack)
	p("NOP SP")
	l.save(g)
	p("#ifndef %s", softfloat)
	lSSE.save(g)
	p("#endif")
	p("CALL ·asyncPreempt2(SB)")
	p("#ifndef %s", softfloat)
	lSSE.restore(g)
	p("#endif")
	l.restore(g)
	p("ADJSP $%d", -lSSE.stack)

	p("POPFL")
	p("RET")
}

// genAMD64 emits the body of asyncPreempt for amd64 and, as a side effect,
// writes the xRegs type file for g.goarch via writeXRegs.
//
// The integer registers (everything in regNamesAMD64 except SP, BP and the X
// registers) are saved in a stack frame. The extended registers are saved
// through a pointer held in AX: the emitted code picks the SSE (X), AVX2 (Y)
// or AVX-512 (Z and K) save sequence at run time, stores into
// p.xRegs.scratch, and after the call to asyncPreempt2 restores from the
// pointer loaded from p.xRegs.cache.
func genAMD64(g *gen) {
	const xReg = "AX" // *xRegState

	p, label := g.p, g.label

	// Assign stack offsets.
	var l = layout{sp: "SP"}
	for _, reg := range regNamesAMD64 {
		if reg == "SP" || reg == "BP" {
			continue
		}
		if !strings.HasPrefix(reg, "X") {
			l.add("MOVQ", reg, 8)
		}
	}
	// Create layouts for X, Y, and Z registers.
	const (
		numXRegs = 16
		numZRegs = 32
		numKRegs = 8
	)
	lZRegs := layout{sp: xReg} // Non-GP registers
	// lXRegs and lYRegs start as copies of the still-empty lZRegs, so all
	// three layouts address their slots relative to xReg.
	lXRegs, lYRegs := lZRegs, lZRegs
	for i := range numZRegs {
		lZRegs.add("VMOVDQU64", fmt.Sprintf("Z%d", i), 512/8)
		if i < numXRegs {
			// Use SSE-only instructions for X registers.
			lXRegs.add("MOVUPS", fmt.Sprintf("X%d", i), 128/8)
			lYRegs.add("VMOVDQU", fmt.Sprintf("Y%d", i), 256/8)
		}
	}
	for i := range numKRegs {
		lZRegs.add("KMOVQ", fmt.Sprintf("K%d", i), 8)
	}
	// The Z layout is the most general, so we line up the others with that one.
	// We don't have to do this, but it results in a nice Go type. If we split
	// this into multiple types, we probably should stop doing this.
	for i := range lXRegs.regs {
		for j := range lXRegs.regs[i].regs {
			lXRegs.regs[i].regs[j].pos = lZRegs.regs[i].regs[j].pos
			lYRegs.regs[i].regs[j].pos = lZRegs.regs[i].regs[j].pos
		}

	}
	// The generated Go type is derived from the Z layout only.
	writeXRegs(g.goarch, &lZRegs)

	p("PUSHQ BP")
	p("MOVQ SP, BP")
	p("// Save flags before clobbering them")
	p("PUSHFQ")
	p("// obj doesn't understand ADD/SUB on SP, but does understand ADJSP")
	p("ADJSP $%d", l.stack)
	p("// But vet doesn't know ADJSP, so suppress vet stack checking")
	p("NOP SP")

	p("// Save GPs")
	l.save(g)

	// In general, the limitations on asynchronous preemption mean we only
	// preempt in ABIInternal code. However, there's at least one exception to
	// this: when we're in an open-coded transition between an ABIInternal
	// function and an ABI0 call. We could more carefully arrange unsafe points
	// to avoid ever landing in ABI0, but it's easy to just make this code not
	// sensitive to the ABI we're preempting. The CALL to asyncPreempt2 will
	// ensure we're in ABIInternal register state.
	p("// Save extended register state to p.xRegs.scratch")
	p("// Don't make assumptions about ABI register state. See mkpreempt.go")
	p("get_tls(CX)")
	p("MOVQ g(CX), R14")
	p("MOVQ g_m(R14), %s", xReg)
	p("MOVQ m_p(%s), %s", xReg, xReg)
	p("LEAQ (p_xRegs+xRegPerP_scratch)(%s), %s", xReg, xReg)

	// Which registers do we need to save?
	// The feature checks are only assembled under GOEXPERIMENT_simd;
	// otherwise execution falls straight through to saveSSE.
	p("#ifdef GOEXPERIMENT_simd")
	p("CMPB internal∕cpu·X86+const_offsetX86HasAVX512(SB), $1")
	p("JE saveAVX512")
	p("CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1")
	p("JE saveAVX2")
	p("#endif")

	// No features. Assume only SSE.
	label("saveSSE:")
	lXRegs.save(g)
	p("JMP preempt")

	label("saveAVX2:")
	lYRegs.save(g)
	p("JMP preempt")

	label("saveAVX512:")
	lZRegs.save(g)
	p("JMP preempt")

	label("preempt:")
	p("CALL ·asyncPreempt2(SB)")

	// Unlike the save path, which takes the address of the scratch area with
	// LEAQ, the restore path loads a pointer from the cache field with MOVQ.
	p("// Restore non-GPs from *p.xRegs.cache")
	p("MOVQ g_m(R14), %s", xReg)
	p("MOVQ m_p(%s), %s", xReg, xReg)
	p("MOVQ (p_xRegs+xRegPerP_cache)(%s), %s", xReg, xReg)

	p("#ifdef GOEXPERIMENT_simd")
	p("CMPB internal∕cpu·X86+const_offsetX86HasAVX512(SB), $1")
	p("JE restoreAVX512")
	p("CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1")
	p("JE restoreAVX2")
	p("#endif")

	label("restoreSSE:")
	lXRegs.restore(g)
	p("JMP restoreGPs")

	label("restoreAVX2:")
	lYRegs.restore(g)
	p("JMP restoreGPs")

	label("restoreAVX512:")
	lZRegs.restore(g)
	p("JMP restoreGPs")

	label("restoreGPs:")
	p("// Restore GPs")
	l.restore(g)
	p("ADJSP $%d", -l.stack)
	p("POPFQ")
	p("POPQ BP")
	p("RET")
}

// genARM emits the body of asyncPreempt for arm.
//
// R0-R12 (except R10) and CPSR are always saved. FPCR and F0-F15 are laid
// out after them; the emitted code tests ·goarmsoftfp at run time and skips
// the floating point save and restore when it is nonzero.
func genARM(g *gen) {
	p := g.p

	// Add integer registers R0-R12.
	// R13 (SP), R14 (LR), R15 (PC) are special and not saved here.
	var l = layout{sp: "R13", stack: 4} // add LR slot
	for i := 0; i <= 12; i++ {
		reg := fmt.Sprintf("R%d", i)
		if i == 10 {
			continue // R10 is g register, no need to save/restore
		}
		l.add("MOVW", reg, 4)
	}
	// Add flag register.
	// CPSR is moved through R0, which has already been saved by the time
	// this runs and is restored after CPSR on the way back.
	l.addSpecial(
		"MOVW CPSR, R0\nMOVW R0, %d(R13)",
		"MOVW %d(R13), R0\nMOVW R0, CPSR",
		4)

	// Add floating point registers F0-F15 and flag register.
	// lfp continues where l ends, so lfp.stack is the full frame size.
	var lfp = layout{stack: l.stack, sp: "R13"}
	lfp.addSpecial(
		"MOVW FPCR, R0\nMOVW R0, %d(R13)",
		"MOVW %d(R13), R0\nMOVW R0, FPCR",
		4)
	for i := 0; i <= 15; i++ {
		reg := fmt.Sprintf("F%d", i)
		lfp.add("MOVD", reg, 8)
	}

	p("MOVW.W R14, -%d(R13)", lfp.stack) // allocate frame, save LR
	l.save(g)
	p("MOVB ·goarmsoftfp(SB), R0\nCMP $0, R0\nBNE nofp") // test goarmsoftfp, and skip FP registers if goarmsoftfp!=0.
	lfp.save(g)
	g.label("nofp:")
	p("CALL ·asyncPreempt2(SB)")
	p("MOVB ·goarmsoftfp(SB), R0\nCMP $0, R0\nBNE nofp2") // test goarmsoftfp, and skip FP registers if goarmsoftfp!=0.
	lfp.restore(g)
	g.label("nofp2:")
	l.restore(g)

	p("MOVW %d(R13), R14", lfp.stack)     // sigctxt.pushCall pushes LR on stack, restore it
	p("MOVW.P %d(R13), R15", lfp.stack+4) // load PC, pop frame (including the space pushed by sigctxt.pushCall)
	p("UNDEF")                            // shouldn't get here
}

// genARM64 emits the body of asyncPreempt for arm64 and, as a side effect,
// writes the xRegs type file for g.goarch via writeXRegs.
//
// Integer registers are saved in pairs with STP/LDP in a stack frame,
// together with NZCV and FPSR. The vector registers V0-V31 are saved four at
// a time with post-incrementing VST1.P/VLD1.P through a pointer held in R0:
// they are stored to p.xRegs.scratch and reloaded from the pointer read from
// p.xRegs.cache.
func genARM64(g *gen) {
	const vReg = "R0" // *xRegState
	p := g.p
	// Add integer registers R0-R26
	// R27 (REGTMP), R28 (g), R29 (FP), R30 (LR), R31 (SP) are special
	// and not saved here.
	var l = layout{sp: "RSP", stack: 8} // add slot to save PC of interrupted instruction
	for i := 0; i < 26; i += 2 {
		if i == 18 {
			// Step back by one so that, after the loop's i += 2, the next
			// pair starts at R19: the pairs are (R16, R17), (R19, R20), ...
			i--
			continue // R18 is not used, skip
		}
		regs := []regInfo{
			{name: fmt.Sprintf("R%d", i), size: 8},
			{name: fmt.Sprintf("R%d", i+1), size: 8},
		}
		l.add2("STP", "LDP", regs, [2]string{"(", ")"}, false)
	}
	// Add flag registers.
	l.addSpecial(
		"MOVD NZCV, R0\nMOVD R0, %d(RSP)",
		"MOVD %d(RSP), R0\nMOVD R0, NZCV",
		8)
	l.addSpecial(
		"MOVD FPSR, R0\nMOVD R0, %d(RSP)",
		"MOVD %d(RSP), R0\nMOVD R0, FPSR",
		8)
	// TODO: FPCR? I don't think we'll change it, so no need to save.
	// Add vector registers V0-V31, four per instruction.
	// These sequences use fixedOffset: pos is not a slot offset but the
	// constant 64 emitted as the post-increment of vReg (4 registers of 16
	// bytes each).
	lVRegs := layout{sp: vReg} // Non-GP registers
	for i := 0; i < 31; i += 4 {
		regs := []regInfo{
			{name: fmt.Sprintf("V%d", i), suffix: ".B16", size: 16, pos: 64},
			{name: fmt.Sprintf("V%d", i+1), suffix: ".B16", size: 16, pos: 64},
			{name: fmt.Sprintf("V%d", i+2), suffix: ".B16", size: 16, pos: 64},
			{name: fmt.Sprintf("V%d", i+3), suffix: ".B16", size: 16, pos: 64},
		}
		lVRegs.add2("VST1.P", "VLD1.P", regs, [2]string{"[", "]"}, true)
	}
	writeXRegs(g.goarch, &lVRegs)
	if l.stack%16 != 0 {
		l.stack += 8 // SP needs 16-byte alignment
	}

	// allocate frame, save PC of interrupted instruction (in LR)
	p("MOVD R30, %d(RSP)", -l.stack)
	p("SUB $%d, RSP", l.stack)
	p("MOVD R29, -8(RSP)") // save frame pointer (only used on Linux)
	p("SUB $8, RSP, R29")  // set up new frame pointer
	// On iOS, save the LR again after decrementing SP. We run the
	// signal handler on the G stack (as it doesn't support sigaltstack),
	// so any writes below SP may be clobbered.
	p("#ifdef GOOS_ios")
	p("MOVD R30, (RSP)")
	p("#endif")

	p("// Save GPs")
	l.save(g)
	p("// Save extended register state to p.xRegs.scratch")
	p("MOVD g_m(g), %s", vReg)
	p("MOVD m_p(%s), %s", vReg, vReg)
	p("ADD $(p_xRegs+xRegPerP_scratch), %s, %s", vReg, vReg)
	lVRegs.save(g)
	p("CALL ·asyncPreempt2(SB)")
	p("// Restore non-GPs from *p.xRegs.cache")
	p("MOVD g_m(g), %s", vReg)
	p("MOVD m_p(%s), %s", vReg, vReg)
	p("MOVD (p_xRegs+xRegPerP_cache)(%s), %s", vReg, vReg)
	// The vector registers are reloaded in forward order, the same order in
	// which the post-incrementing stores wrote them.
	lVRegs.restoreDirect(g)
	p("// Restore GPs")
	l.restore(g)

	p("MOVD %d(RSP), R30", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it
	p("MOVD -8(RSP), R29")          // restore frame pointer
	p("MOVD (RSP), R27")            // load PC to REGTMP
	p("ADD $%d, RSP", l.stack+16)   // pop frame (including the space pushed by sigctxt.pushCall)
	p("RET (R27)")
}

// genMIPS emits the body of asyncPreempt for the MIPS family.
//
// When _64bit is true the 64-bit mnemonics (MOVV, MOVD, ADDV, SUBV), 8-byte
// slots and the GOMIPS64_softfloat macro are used; otherwise the 32-bit
// mnemonics (MOVW, MOVF, ADD, SUB), 4-byte slots and GOMIPS_softfloat.
// The floating point save and restore are wrapped in #ifndef <softfloat
// macro>, so they are only assembled when that macro is not defined.
func genMIPS(g *gen, _64bit bool) {
	p := g.p

	mov := "MOVW"
	movf := "MOVF"
	add := "ADD"
	sub := "SUB"
	r28 := "R28"
	regsize := 4
	softfloat := "GOMIPS_softfloat"
	if _64bit {
		mov = "MOVV"
		movf = "MOVD"
		add = "ADDV"
		sub = "SUBV"
		r28 = "RSB"
		regsize = 8
		softfloat = "GOMIPS64_softfloat"
	}

	// Add integer registers R1-R22, R24-R25, R28
	// R0 (zero), R23 (REGTMP), R29 (SP), R30 (g), R31 (LR) are special,
	// and not saved here. R26 and R27 are reserved by kernel and not used.
	var l = layout{sp: "R29", stack: regsize} // add slot to save PC of interrupted instruction (in LR)
	for i := 1; i <= 25; i++ {
		if i == 23 {
			continue // R23 is REGTMP
		}
		reg := fmt.Sprintf("R%d", i)
		l.add(mov, reg, regsize)
	}
	l.add(mov, r28, regsize)
	// HI and LO are moved through R1, which has already been saved by the
	// time this runs and is restored after them on the way back.
	l.addSpecial(
		mov+" HI, R1\n"+mov+" R1, %d(R29)",
		mov+" %d(R29), R1\n"+mov+" R1, HI",
		regsize)
	l.addSpecial(
		mov+" LO, R1\n"+mov+" R1, %d(R29)",
		mov+" %d(R29), R1\n"+mov+" R1, LO",
		regsize)

	// Add floating point control/status register FCR31 (FCR0-FCR30 are irrelevant)
	// lfp continues where l ends, so lfp.stack is the full frame size.
	var lfp = layout{sp: "R29", stack: l.stack}
	lfp.addSpecial(
		mov+" FCR31, R1\n"+mov+" R1, %d(R29)",
		mov+" %d(R29), R1\n"+mov+" R1, FCR31",
		regsize)
	// Add floating point registers F0-F31.
	for i := 0; i <= 31; i++ {
		reg := fmt.Sprintf("F%d", i)
		lfp.add(movf, reg, regsize)
	}

	// allocate frame, save PC of interrupted instruction (in LR)
	p(mov+" R31, -%d(R29)", lfp.stack)
	p(sub+" $%d, R29", lfp.stack)

	l.save(g)
	p("#ifndef %s", softfloat)
	lfp.save(g)
	p("#endif")
	p("CALL ·asyncPreempt2(SB)")
	p("#ifndef %s", softfloat)
	lfp.restore(g)
	p("#endif")
	l.restore(g)

	p(mov+" %d(R29), R31", lfp.stack)     // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it
	p(mov + " (R29), R23")                // load PC to REGTMP
	p(add+" $%d, R29", lfp.stack+regsize) // pop frame (including the space pushed by sigctxt.pushCall)
	p("JMP (R23)")
}

// genLoong64 emits the body of asyncPreempt for loong64 and, as a side
// effect, writes the xRegs type file for g.goarch via writeXRegs.
//
// Integer registers and the condition flags FCC0-FCC7 are saved in a stack
// frame. The floating point / vector registers are saved through a pointer
// held in R4: the emitted code picks the LASX (X), LSX (V) or plain FP (F)
// sequence at run time, stores into p.xRegs.scratch, and after the call to
// asyncPreempt2 restores from the pointer loaded from p.xRegs.cache.
func genLoong64(g *gen) {
	const xReg = "R4" // *xRegState

	p, label := g.p, g.label

	mov := "MOVV"
	add := "ADDV"
	sub := "SUBV"
	regsize := 8

	// Add integer registers R4-R21, R23-R29, R31.
	// R0 (zero), R30 (REGTMP), R2 (tp), R3 (SP), R22 (g), R1 (LR) are special
	// and not saved here.
	var l = layout{sp: "R3", stack: regsize} // add slot to save PC of interrupted instruction (in LR)
	for i := 4; i <= 31; i++ {
		if i == 22 || i == 30 {
			continue
		}
		reg := fmt.Sprintf("R%d", i)
		l.add(mov, reg, regsize)
	}

	// Add condition flag registers FCC0-FCC7.
	// All eight flags share one slot: flag i occupies bits [8*i, 8*i+7] of
	// R5. R4 and R5 serve as scratch here; they have already been saved by
	// the time the save code runs and are restored after the flags on the
	// way back.
	const numFCC = 8
	var sv strings.Builder
	rsLines := make([]string, 0, 2*numFCC)
	for i := range numFCC {
		msb := 7 + (i * 8)
		lsb := 0 + (i * 8)

		// MOVV FCCx, R4
		// BSTRINSV $msb, R4, $lsb, R5
		fmt.Fprintf(&sv, "%s FCC%d, R4\n", mov, i)
		fmt.Fprintf(&sv, "BSTRINSV $%d, R4, $%d, R5\n", msb, lsb)

		// BSTRPICKV $msb, R5, $lsb, R4
		// MOVV R4, FCCx
		rsLines = append(rsLines,
			fmt.Sprintf("BSTRPICKV $%d, R5, $%d, R4", msb, lsb),
			fmt.Sprintf("%s R4, FCC%d", mov, i))
	}
	l.addSpecial(
		sv.String()+mov+" R5, %d(R3)",
		mov+" %d(R3), R5\n"+strings.Join(rsLines, "\n"),
		regsize)

	// Create layouts for lasx, lsx and fp registers.
	// lsxRegs and fpRegs start as copies of the still-empty lasxRegs, so all
	// three layouts address their slots relative to xReg.
	lasxRegs := layout{sp: xReg}
	lsxRegs := lasxRegs
	fpRegs := lasxRegs
	for i := 0; i <= 31; i++ {
		lasxRegs.add("XVMOVQ", fmt.Sprintf("X%d", i), 256/8)
		lsxRegs.add("VMOVQ", fmt.Sprintf("V%d", i), 128/8)
		fpRegs.add("MOVD", fmt.Sprintf("F%d", i), 64/8)
	}

	// Line the narrower layouts up with the LASX layout so that register i
	// lives at the same offset whichever sequence is used.
	for i := range lsxRegs.regs {
		for j := range lsxRegs.regs[i].regs {
			lsxRegs.regs[i].regs[j].pos = lasxRegs.regs[i].regs[j].pos
			fpRegs.regs[i].regs[j].pos = lasxRegs.regs[i].regs[j].pos
		}
	}
	// The generated Go type is derived from the LASX layout only.
	writeXRegs(g.goarch, &lasxRegs)

	// allocate frame, save PC of interrupted instruction (in LR)
	p(mov+" R1, -%d(R3)", l.stack)
	p(sub+" $%d, R3", l.stack)

	p("// Save GPs")
	l.save(g)

	p("// Save extended register state to p.xRegs.scratch")
	p("MOVV g_m(g), %s", xReg)
	p("MOVV m_p(%s), %s", xReg, xReg)
	p("ADDV $(p_xRegs+xRegPerP_scratch), %s, %s", xReg, xReg)

	p("MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R5")
	p("BNE R5, saveLASX")

	p("MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R5")
	p("BNE R5, saveLSX")

	label("saveFP:")
	fpRegs.save(g)
	p("JMP preempt")

	label("saveLSX:")
	lsxRegs.save(g)
	p("JMP preempt")

	label("saveLASX:")
	lasxRegs.save(g)

	label("preempt:")
	p("CALL ·asyncPreempt2(SB)")

	p("// Restore non-GPs from *p.xRegs.cache")
	p("MOVV g_m(g), %s", xReg)
	p("MOVV m_p(%s), %s", xReg, xReg)
	p("MOVV (p_xRegs+xRegPerP_cache)(%s), %s", xReg, xReg)

	p("MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R5")
	p("BNE R5, restoreLASX")

	p("MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R5")
	p("BNE R5, restoreLSX")

	label("restoreFP:")
	fpRegs.restore(g)
	p("JMP restoreGPs")

	label("restoreLSX:")
	lsxRegs.restore(g)
	p("JMP restoreGPs")

	label("restoreLASX:")
	lasxRegs.restore(g)

	p("// Restore GPs")
	label("restoreGPs:")
	l.restore(g)

	p(mov+" %d(R3), R1", l.stack)      // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it
	p(mov + " (R3), R30")              // load PC to REGTMP
	p(add+" $%d, R3", l.stack+regsize) // pop frame (including the space pushed by sigctxt.pushCall)
	p("JMP (R30)")
}

// genPPC64 emits the body of asyncPreempt for ppc64x.
//
// A single layout holds the integer registers R3-R29 (except R12 and R13),
// CR, XER, F0-F31 and FPSCR. R31 is saved and restored by hand because it is
// used as the scratch register for moving LR, CR and XER.
func genPPC64(g *gen) {
	p := g.p

	// Add integer registers R3-R29
	// R0 (zero), R1 (SP), R30 (g) are special and not saved here.
	// R2 (TOC pointer in PIC mode), R12 (function entry address in PIC mode) have been saved in sigctxt.pushCall.
	// R31 (REGTMP) will be saved manually.
	var l = layout{sp: "R1", stack: 32 + 8} // MinFrameSize on PPC64, plus one word for saving R31
	for i := 3; i <= 29; i++ {
		if i == 12 || i == 13 {
			// R12 has been saved in sigctxt.pushCall.
			// R13 is TLS pointer, not used by Go code. we must NOT
			// restore it, otherwise if we parked and resumed on a
			// different thread we'll mess up TLS addresses.
			continue
		}
		reg := fmt.Sprintf("R%d", i)
		l.add("MOVD", reg, 8)
	}
	l.addSpecial(
		"MOVW CR, R31\nMOVW R31, %d(R1)",
		"MOVW %d(R1), R31\nMOVFL R31, $0xff", // this is MOVW R31, CR
		8)                                    // CR is 4-byte wide, but just keep the alignment
	l.addSpecial(
		"MOVD XER, R31\nMOVD R31, %d(R1)",
		"MOVD %d(R1), R31\nMOVD R31, XER",
		8)
	// Add floating point registers F0-F31.
	for i := 0; i <= 31; i++ {
		reg := fmt.Sprintf("F%d", i)
		l.add("FMOVD", reg, 8)
	}
	// Add floating point control/status register FPSCR.
	// FPSCR is moved through F0. It is added last, so on the way in F0 has
	// already been saved, and on the way out (reverse order) FPSCR is
	// restored before F0 is reloaded.
	l.addSpecial(
		"MOVFL FPSCR, F0\nFMOVD F0, %d(R1)",
		"FMOVD %d(R1), F0\nMOVFL F0, FPSCR",
		8)

	// R1 has not been decremented yet, so -(l.stack-32) relative to the old
	// R1 is the same address as 32(R1) once the frame is allocated; that is
	// where R31 is reloaded from below.
	p("MOVD R31, -%d(R1)", l.stack-32) // save R31 first, we'll use R31 for saving LR
	p("MOVD LR, R31")
	p("MOVDU R31, -%d(R1)", l.stack) // allocate frame, save PC of interrupted instruction (in LR)

	l.save(g)
	p("CALL ·asyncPreempt2(SB)")
	l.restore(g)

	p("MOVD %d(R1), R31", l.stack) // sigctxt.pushCall has pushed LR, R2, R12 (at interrupt) on stack, restore them
	p("MOVD R31, LR")
	p("MOVD %d(R1), R2", l.stack+8)
	p("MOVD %d(R1), R12", l.stack+16)
	p("MOVD (R1), R31") // load PC to CTR
	p("MOVD R31, CTR")
	p("MOVD 32(R1), R31")        // restore R31
	p("ADD $%d, R1", l.stack+32) // pop frame (including the space pushed by sigctxt.pushCall)
	p("JMP (CTR)")
}

// genRISCV64 emits the body of asyncPreempt for riscv64.
//
// A single layout holds the integer registers X5-X30 (except X27) and the
// floating point registers F0-F31, after an 8-byte slot at offset 0 that
// receives X1 when the frame is allocated.
func genRISCV64(g *gen) {
	p := g.p

	// X0 (zero), X1 (LR), X2 (SP), X3 (GP), X4 (TP), X27 (g), X31 (TMP) are special.
	var l = layout{sp: "X2", stack: 8}

	// Add integer registers (X5-X26, X28-30).
	for i := 5; i < 31; i++ {
		if i == 27 {
			continue
		}
		reg := fmt.Sprintf("X%d", i)
		l.add("MOV", reg, 8)
	}

	// Add floating point registers (F0-F31).
	for i := 0; i <= 31; i++ {
		reg := fmt.Sprintf("F%d", i)
		l.add("MOVD", reg, 8)
	}

	// Store X1 at what becomes offset 0 of the new frame, then allocate it.
	p("MOV X1, -%d(X2)", l.stack)
	p("SUB $%d, X2", l.stack)
	l.save(g)
	p("CALL ·asyncPreempt2(SB)")
	l.restore(g)
	// Reload X1 from just above the frame, fetch the value stored at offset
	// 0 into X31, pop the frame plus 8 more bytes, and jump through X31.
	p("MOV %d(X2), X1", l.stack)
	p("MOV (X2), X31")
	p("ADD $%d, X2", l.stack+8)
	p("JMP (X31)")
}

// genS390X emits the body of asyncPreempt for s390x.
//
// The frame starts with 16 bytes for the interrupted PC and the flags,
// followed by R0-R12 (stored and loaded as one block with STMG/LMG) and
// F0-F15.
func genS390X(g *gen) {
	p := g.p

	// Add integer registers R0-R12
	// R13 (g), R14 (LR), R15 (SP) are special, and not saved here.
	// Saving R10 (REGTMP) is not necessary, but it is saved anyway.
	var l = layout{sp: "R15", stack: 16} // add slot to save PC of interrupted instruction and flags
	l.addSpecial(
		"STMG R0, R12, %d(R15)",
		"LMG %d(R15), R0, R12",
		13*8)
	// Add floating point registers F0-F15.
	for i := 0; i <= 15; i++ {
		reg := fmt.Sprintf("F%d", i)
		l.add("FMOVD", reg, 8)
	}

	// allocate frame, save PC of interrupted instruction (in LR) and flags (condition code)
	p("IPM R10") // save flags upfront, as ADD will clobber flags
	p("MOVD R14, -%d(R15)", l.stack)
	p("ADD $-%d, R15", l.stack)
	p("MOVW R10, 8(R15)") // save flags

	l.save(g)
	p("CALL ·asyncPreempt2(SB)")
	l.restore(g)

	// The frame is popped before the flags and PC are reloaded, so both are
	// read at negative offsets from the new R15: -l.stack is the old
	// 8(R15) flags slot and -(l.stack+8) is the old (R15) PC slot.
	p("MOVD %d(R15), R14", l.stack)    // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it
	p("ADD $%d, R15", l.stack+8)       // pop frame (including the space pushed by sigctxt.pushCall)
	p("MOVWZ -%d(R15), R10", l.stack)  // load flags to REGTMP
	p("TMLH R10, $(3<<12)")            // restore flags
	p("MOVD -%d(R15), R10", l.stack+8) // load PC to REGTMP
	p("JMP (R10)")
}

// genWasm emits a placeholder body for wasm: a comment followed by UNDEF.
func genWasm(g *gen) {
	g.p("// No async preemption on wasm")
	g.p("UNDEF")
}

// notImplemented emits a placeholder body that jumps to ·abort. It has the
// same signature as the per-architecture generators.
func notImplemented(g *gen) {
	g.p("// Not implemented yet")
	g.p("JMP ·abort(SB)")
}
