// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package asmgen

// mulAddVWW generates mulAddVWW, which does z, c = x*m + a.
func mulAddVWW(a *Asm) {
	f := a.Func("func mulAddVWW(z, x []Word, m, a Word) (c Word)")

	if a.AltCarry().Valid() {
		addMulVirtualCarry(f, 0)
		return
	}
	addMul(f, "", "x", 0)
}
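
// For reference, the operation mulAddVWW must implement can be written as
// ordinary Go over 64-bit words using math/bits. This is only an illustrative
// sketch (the name mulAddVWWRef is made up and nothing in this package uses
// it); it is the loop the generated assembly has to be equivalent to:
//
//	func mulAddVWWRef(z, x []uint64, m, a uint64) (c uint64) {
//		for i := range z {
//			hi, lo := bits.Mul64(m, x[i])
//			lo, carry := bits.Add64(lo, a, 0)
//			z[i] = lo
//			a = hi + carry // cannot overflow
//		}
//		return a
//	}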

// addMulVVWW generates addMulVVWW, which does z, c = x + y*m + a.
// (A more pedantic name would be addMulAddVVWW.)
func addMulVVWW(a *Asm) {
	f := a.Func("func addMulVVWW(z, x, y []Word, m, a Word) (c Word)")

	// If the architecture has virtual carries, emit that version unconditionally.
	if a.AltCarry().Valid() {
		addMulVirtualCarry(f, 1)
		return
	}

	// If the architecture optionally has two carries, test and emit both versions.
	if a.JmpEnable(OptionAltCarry, "altcarry") {
		regs := a.RegsUsed()
		addMul(f, "x", "y", 1)
		a.Label("altcarry")
		a.SetOption(OptionAltCarry, true)
		a.SetRegsUsed(regs)
		addMulAlt(f)
		a.SetOption(OptionAltCarry, false)
		return
	}

	// Otherwise emit the one-carry form.
	addMul(f, "x", "y", 1)
}

// Computing z = addsrc + m*mulsrc + a, we need:
//
//	for i := range z {
//		lo, hi := m * mulsrc[i]
//		lo, carry = bits.Add(lo, a, 0)
//		lo, carryAlt = bits.Add(lo, addsrc[i], 0)
//		z[i] = lo
//		a = hi + carry + carryAlt // cannot overflow
//	}
//
// The final addition cannot overflow because after processing N words,
// the maximum possible value is (for a 64-bit system):
//
//	(2**64N - 1) + (2**64 - 1)*(2**64N - 1) + (2**64 - 1)
//		= (2**64)*(2**64N - 1) + (2**64 - 1)
//		= 2**64(N+1) - 1,
//
// which fits in N+1 words (the high-order one being the new value of a).
//
// (For example, with 3 decimal words, 999 + 9*999 + 9 = 999*10 + 9 = 9999.)
//
// If we unroll the loop a bit, then we can chain the carries in two passes.
// Consider:
//
//	lo0, hi0 := m * mulsrc[i]
//	lo0, carry = bits.Add(lo0, a, 0)
//	lo0, carryAlt = bits.Add(lo0, addsrc[i], 0)
//	z[i] = lo0
//	a = hi0 + carry + carryAlt // cannot overflow
//
//	lo1, hi1 := m * mulsrc[i+1]
//	lo1, carry = bits.Add(lo1, a, 0)
//	lo1, carryAlt = bits.Add(lo1, addsrc[i+1], 0)
//	z[i+1] = lo1
//	a = hi1 + carry + carryAlt // cannot overflow
//
//	lo2, hi2 := m * mulsrc[i+2]
//	lo2, carry = bits.Add(lo2, a, 0)
//	lo2, carryAlt = bits.Add(lo2, addsrc[i+2], 0)
//	z[i+2] = lo2
//	a = hi2 + carry + carryAlt // cannot overflow
//
//	lo3, hi3 := m * mulsrc[i+3]
//	lo3, carry = bits.Add(lo3, a, 0)
//	lo3, carryAlt = bits.Add(lo3, addsrc[i+3], 0)
//	z[i+3] = lo3
//	a = hi3 + carry + carryAlt // cannot overflow
//
// There are three ways we can optimize this sequence.
//
// (1) By reordering, we can chain the carries so that we still use only one
// hardware carry flag but amortize the cost of saving and restoring it across
// multiple instructions:
//
//	// multiply
//	lo0, hi0 := m * mulsrc[i]
//	lo1, hi1 := m * mulsrc[i+1]
//	lo2, hi2 := m * mulsrc[i+2]
//	lo3, hi3 := m * mulsrc[i+3]
//
//	lo0, carry = bits.Add(lo0, a, 0)
//	lo1, carry = bits.Add(lo1, hi0, carry)
//	lo2, carry = bits.Add(lo2, hi1, carry)
//	lo3, carry = bits.Add(lo3, hi2, carry)
//	a = hi3 + carry // cannot overflow
//
//	// add
//	lo0, carryAlt = bits.Add(lo0, addsrc[i], 0)
//	lo1, carryAlt = bits.Add(lo1, addsrc[i+1], carryAlt)
//	lo2, carryAlt = bits.Add(lo2, addsrc[i+2], carryAlt)
//	lo3, carryAlt = bits.Add(lo3, addsrc[i+3], carryAlt)
//	a = a + carryAlt // cannot overflow
//
//	z[i] = lo0
//	z[i+1] = lo1
//	z[i+2] = lo2
//	z[i+3] = lo3
//
// addMul takes this approach, using the hardware carry flag
// first for carry and then for carryAlt.
//
// (2) addMulAlt assumes there are two hardware carry flags available.
// It dedicates one each to carry and carryAlt, so that a multi-block
// unrolling can keep the flags in hardware across all the blocks.
// So even if the block size is 1, the code can do:
//
//	// multiply and add
//	lo0, hi0 := m * mulsrc[i]
//	lo0, carry = bits.Add(lo0, a, 0)
//	lo0, carryAlt = bits.Add(lo0, addsrc[i], 0)
//	z[i] = lo0
//
//	lo1, hi1 := m * mulsrc[i+1]
//	lo1, carry = bits.Add(lo1, hi0, carry)
//	lo1, carryAlt = bits.Add(lo1, addsrc[i+1], carryAlt)
//	z[i+1] = lo1
//
//	lo2, hi2 := m * mulsrc[i+2]
//	lo2, carry = bits.Add(lo2, hi1, carry)
//	lo2, carryAlt = bits.Add(lo2, addsrc[i+2], carryAlt)
//	z[i+2] = lo2
//
//	lo3, hi3 := m * mulsrc[i+3]
//	lo3, carry = bits.Add(lo3, hi2, carry)
//	lo3, carryAlt = bits.Add(lo3, addsrc[i+3], carryAlt)
//	z[i+3] = lo3
//
//	a = hi3 + carry + carryAlt // cannot overflow
//
// (3) addMulVirtualCarry optimizes for systems with explicitly computed carry bits
// (loong64, mips, riscv64), cutting the number of actual instructions almost in half.
// Look again at the original word-at-a-time version:
//
//	lo1, hi1 := m * mulsrc[i]
//	lo1, carry = bits.Add(lo1, a, 0)
//	lo1, carryAlt = bits.Add(lo1, addsrc[i], 0)
//	z[i] = lo1
//	a = hi1 + carry + carryAlt // cannot overflow
//
// Although it uses four adds per word, those are cheap adds: the two bits.Add adds
// use two instructions each (ADD+SLTU) and the final + adds use only one ADD each,
// for a total of 6 instructions per word. In contrast, the middle stanzas in (2) use
// only two “adds” per word, but those are SetCarry|UseCarry adds, which compile to
// five instructions each, for a total of 10 instructions per word. So the
// word-at-a-time loop is actually better. And we can reorder things slightly to use
// only a single carry bit:
//
//	lo1, hi1 := m * mulsrc[i]
//	lo1, carry = bits.Add(lo1, a, 0)
//	a = hi1 + carry
//	lo1, carry = bits.Add(lo1, addsrc[i], 0)
//	a = a + carry
//	z[i] = lo1
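
// For concreteness, one unrolled block of strategy (1) can be written in plain Go
// with the math/bits intrinsics. This is only an illustrative sketch (assuming
// 64-bit words, with mulsrc = y and addsrc = x); note that bits.Mul64 returns the
// high word first, while the pseudocode above writes the low word first:
//
//	h0, l0 := bits.Mul64(m, y[i])
//	h1, l1 := bits.Mul64(m, y[i+1])
//	h2, l2 := bits.Mul64(m, y[i+2])
//	h3, l3 := bits.Mul64(m, y[i+3])
//
//	// First pass: chain the multiply carries through a single carry value.
//	l0, c := bits.Add64(l0, a, 0)
//	l1, c = bits.Add64(l1, h0, c)
//	l2, c = bits.Add64(l2, h1, c)
//	l3, c = bits.Add64(l3, h2, c)
//	a = h3 + c // cannot overflow
//
//	// Second pass: reuse the same carry value for the addition chain.
//	l0, c = bits.Add64(l0, x[i], 0)
//	l1, c = bits.Add64(l1, x[i+1], c)
//	l2, c = bits.Add64(l2, x[i+2], c)
//	l3, c = bits.Add64(l3, x[i+3], c)
//	a += c // cannot overflow
//
//	z[i], z[i+1], z[i+2], z[i+3] = l0, l1, l2, l3
//
// addMul below emits this shape, using the one hardware carry flag first for the
// multiply chain and then for the addition chain.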

func addMul(f *Func, addsrc, mulsrc string, mulIndex int) {
	a := f.Asm
	mh := HintNone
	if a.Arch == Arch386 && addsrc != "" {
		mh = HintMemOK // too few registers otherwise
	}
	m := f.ArgHint("m", mh)
	c := f.Arg("a")
	n := f.Arg("z_len")

	p := f.Pipe()
	if addsrc != "" {
		p.SetHint(addsrc, HintMemOK)
	}
	p.SetHint(mulsrc, HintMulSrc)
	unroll := []int{1, 4}
	switch a.Arch {
	case Arch386:
		unroll = []int{1} // too few registers
	case ArchARM:
		p.SetMaxColumns(2) // too few registers (but more than 386)
	case ArchARM64:
		unroll = []int{1, 8} // 5% speedup on c4as16
	}

	// See the large comment above for an explanation of the code being generated.
	// This is optimization strategy (1).
	p.Start(n, unroll...)
	p.Loop(func(in, out [][]Reg) {
		a.Comment("multiply")
		prev := c
		flag := SetCarry
		for i, x := range in[mulIndex] {
			hi := a.RegHint(HintMulHi)
			a.MulWide(m, x, x, hi)
			a.Add(prev, x, x, flag)
			flag = UseCarry | SetCarry
			if prev != c {
				a.Free(prev)
			}
			out[0][i] = x
			prev = hi
		}
		a.Add(a.Imm(0), prev, c, UseCarry|SmashCarry)
		if addsrc != "" {
			a.Comment("add")
			flag := SetCarry
			for i, x := range in[0] {
				a.Add(x, out[0][i], out[0][i], flag)
				flag = UseCarry | SetCarry
			}
			a.Add(a.Imm(0), c, c, UseCarry|SmashCarry)
		}
		p.StoreN(out)
	})
	f.StoreArg(c, "c")
	a.Ret()
}

func addMulAlt(f *Func) {
	a := f.Asm
	m := f.ArgHint("m", HintMulSrc)
	c := f.Arg("a")
	n := f.Arg("z_len")

	// On amd64, we need a non-immediate for the AtUnrollEnd adds.
	r0 := a.ZR()
	if !r0.Valid() {
		r0 = a.Reg()
		a.Mov(a.Imm(0), r0)
	}

	p := f.Pipe()
	p.SetLabel("alt")
	p.SetHint("x", HintMemOK)
	p.SetHint("y", HintMemOK)
	if a.Arch == ArchAMD64 {
		p.SetMaxColumns(2)
	}

	// See the large comment above for an explanation of the code being generated.
	// This is optimization strategy (2).
	var hi Reg
	prev := c
	p.Start(n, 1, 8)
	p.AtUnrollStart(func() {
		a.Comment("multiply and add")
		a.ClearCarry(AddCarry | AltCarry)
		a.ClearCarry(AddCarry)
		hi = a.Reg()
	})
	p.AtUnrollEnd(func() {
		a.Add(r0, prev, c, UseCarry|SmashCarry)
		a.Add(r0, c, c, UseCarry|SmashCarry|AltCarry)
		prev = c
	})
	p.Loop(func(in, out [][]Reg) {
		for i, y := range in[1] {
			x := in[0][i]
			lo := y
			if lo.IsMem() {
				lo = a.Reg()
			}
			a.MulWide(m, y, lo, hi)
			a.Add(prev, lo, lo, UseCarry|SetCarry)
			a.Add(x, lo, lo, UseCarry|SetCarry|AltCarry)
			out[0][i] = lo
			prev, hi = hi, prev
		}
		p.StoreN(out)
	})
	f.StoreArg(c, "c")
	a.Ret()
}

func addMulVirtualCarry(f *Func, mulIndex int) {
	a := f.Asm
	m := f.Arg("m")
	c := f.Arg("a")
	n := f.Arg("z_len")

	// See the large comment above for an explanation of the code being generated.
	// This is optimization strategy (3).
	p := f.Pipe()
	p.Start(n, 1, 4)
	p.Loop(func(in, out [][]Reg) {
		a.Comment("synthetic carry, one column at a time")
		lo, hi := a.Reg(), a.Reg()
		for i, x := range in[mulIndex] {
			a.MulWide(m, x, lo, hi)
			if mulIndex == 1 {
				a.Add(in[0][i], lo, lo, SetCarry)
				a.Add(a.Imm(0), hi, hi, UseCarry|SmashCarry)
			}
			a.Add(c, lo, x, SetCarry)
			a.Add(a.Imm(0), hi, c, UseCarry|SmashCarry)
			out[0][i] = x
		}
		p.StoreN(out)
	})
	f.StoreArg(c, "c")
	a.Ret()
}
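
// For reference, the reordered single-carry form that addMulVirtualCarry targets
// (strategy (3) above) looks like this in plain Go with math/bits. This is only an
// illustrative sketch (assuming 64-bit words, with mulsrc = y and addsrc = x; the
// addsrc step is skipped for mulAddVWW). Per the cost discussion above, on the
// explicit-carry targets each bits.Add64 lowers to roughly ADD+SLTU and each
// plain + to a single ADD:
//
//	hi, lo := bits.Mul64(m, y[i])
//	lo, carry := bits.Add64(lo, a, 0)
//	a = hi + carry
//	lo, carry = bits.Add64(lo, x[i], 0)
//	a += carry
//	z[i] = lo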