// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package asmgen

import (
	"fmt"
	"math/bits"
	"slices"
)

// Note: Exported fields and methods are expected to be used
// by function generators (like the ones in add.go and so on).
// Unexported fields and methods should not be.

// A Pipe manages the input and output data pipelines for a function's
// memory operations.
//
// The input is one or more equal-length slices of words, so collectively
// it can be viewed as a matrix, in which each slice is a row and each column
// is a set of corresponding words from the different slices.
// The output can be viewed the same way, although it is often just one row.
type Pipe struct {
	f               *Func    // function being generated
	label           string   // prefix for loop labels (default "loop")
	backward        bool     // processing columns in reverse
	started         bool     // Start has been called
	loaded          bool     // LoadPtrs has been called
	inPtr           []RegPtr // input slice pointers
	hints           []Hint   // for each inPtr, a register hint to use for its data
	outPtr          []RegPtr // output slice pointers
	index           Reg      // index register, if in use
	useIndexCounter bool     // index counter requested
	indexCounter    int      // index is also counter (386); 0 no, -1 negative counter, +1 positive counter
	readOff         int      // read offset not yet added to index
	writeOff        int      // write offset not yet added to index
	factors         []int    // unrolling factors
	counts          []Reg    // iterations for each factor
	needWrite       bool     // need a write call during Loop1/LoopN
	maxColumns      int      // maximum columns during unrolled loop
	unrollStart     func()   // emit code at start of unrolled body
	unrollEnd       func()   // emit code end of unrolled body
}

// Pipe creates and returns a new pipe for use in the function f.
func (f *Func) Pipe() *Pipe {
	a := f.Asm
	p := &Pipe{
		f:          f,
		label:      "loop",
		maxColumns: 10000000,
	}
	if m := a.Arch.maxColumns; m != 0 {
		p.maxColumns = m
	}
	return p
}

// SetBackward sets the pipe to process the input and output columns in reverse order.
// This is needed for left shifts, which might otherwise overwrite data they will read later.
func (p *Pipe) SetBackward() {
	if p.loaded {
		p.f.Asm.Fatalf("SetBackward after Start/LoadPtrs")
	}
	p.backward = true
}

// SetUseIndexCounter sets the pipe to use an index counter if possible,
// meaning the loop counter is also used as an index for accessing the slice data.
// This clever trick is slower on modern processors, but it is still necessary on 386.
// On non-386 systems, SetUseIndexCounter is a no-op.
func (p *Pipe) SetUseIndexCounter() {
	if p.f.Asm.Arch.memIndex == nil { // need memIndex (only 386 provides it)
		return
	}
	p.useIndexCounter = true
}

// SetLabel sets the label prefix for the loops emitted by the pipe.
// The default prefix is "loop".
func (p *Pipe) SetLabel(label string) {
	p.label = label
}

// SetMaxColumns sets the maximum number of
// columns processed in a single loop body call.
func (p *Pipe) SetMaxColumns(m int) {
	p.maxColumns = m
}

// SetHint records that the inputs from the named vector
// should be allocated with the given register hint.
//
// If the hint indicates a single register on the target architecture,
// then SetHint calls SetMaxColumns(1), since the hinted register
// can only be used for one value at a time.
func (p *Pipe) SetHint(name string, hint Hint) {
	if hint == HintMemOK && !p.f.Asm.Arch.memOK {
		return
	}
	i := slices.Index(p.f.inputs, name)
	if i < 0 {
		p.f.Asm.Fatalf("unknown input name %s", name)
	}
	if p.f.Asm.hint(hint) != "" {
		p.SetMaxColumns(1)
	}
	for len(p.hints) <= i {
		p.hints = append(p.hints, HintNone)
	}
	p.hints[i] = hint
}

// LoadPtrs loads the slice pointer arguments into registers,
// assuming that the slice length n has already been loaded
// into the register n.
//
// Start will call LoadPtrs if it has not been called already.
// LoadPtrs only needs to be called explicitly when code needs
// to use LoadN before Start, like when the shift.go generators
// read an initial word before the loop.
func (p *Pipe) LoadPtrs(n Reg) {
	a := p.f.Asm
	if p.loaded {
		a.Fatalf("pointers already loaded")
	}

	// Load the actual pointers.
	p.loaded = true
	for _, name := range p.f.inputs {
		p.inPtr = append(p.inPtr, RegPtr(p.f.Arg(name+"_base")))
	}
	for _, name := range p.f.outputs {
		p.outPtr = append(p.outPtr, RegPtr(p.f.Arg(name+"_base")))
	}

	// Decide the memory access strategy for LoadN and StoreN.
	switch {
	case p.backward && p.useIndexCounter:
		// Generator wants an index counter, meaning when the iteration counter
		// is AX, we will access the slice with pointer BX using (BX)(AX*WordBytes).
		// The loop is moving backward through the slice, but the counter
		// is also moving backward, so not much to do.
		a.Comment("run loop backward, using counter as positive index")
		p.indexCounter = +1
		p.index = n

	case !p.backward && p.useIndexCounter:
		// Generator wants an index counter, but the loop is moving forward.
		// To make the counter move in the direction of data access,
		// we negate the counter, counting up from -len(z) to -1.
		// To make the index access the right words, we add len(z)*WordBytes
		// to each of the pointers.
		// See comment below about the garbage collector (non-)implications
		// of pointing beyond the slice bounds.
		a.Comment("use counter as negative index")
		p.indexCounter = -1
		p.index = n
		for _, ptr := range p.inPtr {
			a.AddWords(n, ptr, ptr)
		}
		for _, ptr := range p.outPtr {
			a.AddWords(n, ptr, ptr)
		}
		a.Neg(n, n)

	case p.backward:
		// Generator wants to run the loop backward.
		// We'll decrement the pointers before using them,
		// so position them at the very end of the slices.
		// If we had precise pointer information for assembly,
		// these pointers would cause problems with the garbage collector,
		// since they no longer point into the allocated slice,
		// but the garbage collector ignores unexpected values in assembly stacks,
		// and the actual slice pointers are still in the argument stack slots,
		// so the slices won't be collected early.
		// If we switched to the register ABI, we might have to rethink this.
		// (The same thing happens by the end of forward loops,
		// but it's less important since once the pointers go off the slice
		// in a forward loop, the loop is over and the slice won't be accessed anymore.)
		a.Comment("run loop backward")
		for _, ptr := range p.inPtr {
			a.AddWords(n, ptr, ptr)
		}
		for _, ptr := range p.outPtr {
			a.AddWords(n, ptr, ptr)
		}

	case !p.backward:
		// Nothing to do!
	}
}

// LoadN returns the next n columns of input words as a slice of rows.
// Regs for inputs that have been marked using p.SetMemOK will be direct memory references.
// Regs for other inputs will be newly allocated registers and must be freed.
func (p *Pipe) LoadN(n int) [][]Reg {
	a := p.f.Asm
	regs := make([][]Reg, len(p.inPtr))
	for i, ptr := range p.inPtr {
		regs[i] = make([]Reg, n)
		switch {
		case a.Arch.loadIncN != nil:
			// Load from memory and advance pointers at the same time.
			for j := range regs[i] {
				regs[i][j] = p.f.Asm.Reg()
			}
			if p.backward {
				a.Arch.loadDecN(a, ptr, regs[i])
			} else {
				a.Arch.loadIncN(a, ptr, regs[i])
			}

		default:
			// Load from memory using offsets.
			// We'll advance the pointers or the index counter later.
			for j := range n {
				off := p.readOff + j
				if p.backward {
					off = -(off + 1)
				}
				var mem Reg
				if p.indexCounter != 0 {
					mem = a.Arch.memIndex(a, off*a.Arch.WordBytes, p.index, ptr)
				} else {
					mem = ptr.mem(off * a.Arch.WordBytes)
				}
				h := HintNone
				if i < len(p.hints) {
					h = p.hints[i]
				}
				if h == HintMemOK {
					regs[i][j] = mem
				} else {
					r := p.f.Asm.RegHint(h)
					a.Mov(mem, r)
					regs[i][j] = r
				}
			}
		}
	}
	p.readOff += n
	return regs
}

// StoreN writes regs (a slice of rows) to the next n columns of output, where n = len(regs[0]).
func (p *Pipe) StoreN(regs [][]Reg) {
	p.needWrite = false
	a := p.f.Asm
	if len(regs) != len(p.outPtr) {
		p.f.Asm.Fatalf("wrong number of output rows")
	}
	n := len(regs[0])
	for i, ptr := range p.outPtr {
		switch {
		case a.Arch.storeIncN != nil:
			// Store to memory and advance pointers at the same time.
			if p.backward {
				a.Arch.storeDecN(a, ptr, regs[i])
			} else {
				a.Arch.storeIncN(a, ptr, regs[i])
			}

		default:
			// Store to memory using offsets.
			// We'll advance the pointers or the index counter later.
			for j, r := range regs[i] {
				off := p.writeOff + j
				if p.backward {
					off = -(off + 1)
				}
				var mem Reg
				if p.indexCounter != 0 {
					mem = a.Arch.memIndex(a, off*a.Arch.WordBytes, p.index, ptr)
				} else {
					mem = ptr.mem(off * a.Arch.WordBytes)
				}
				a.Mov(r, mem)
			}
		}
	}
	p.writeOff += n
}

// advancePtrs advances the pointers by step
// or handles bookkeeping for an imminent index advance by step
// that the caller will do.
func (p *Pipe) advancePtrs(step int) {
	a := p.f.Asm
	switch {
	case a.Arch.loadIncN != nil:
		// nothing to do

	default:
		// Adjust read/write offsets for pointer advance (or imminent index advance).
		p.readOff -= step
		p.writeOff -= step

		if p.indexCounter == 0 {
			// Advance pointers.
			if p.backward {
				step = -step
			}
			for _, ptr := range p.inPtr {
				a.Add(a.Imm(step*a.Arch.WordBytes), Reg(ptr), Reg(ptr), KeepCarry)
			}
			for _, ptr := range p.outPtr {
				a.Add(a.Imm(step*a.Arch.WordBytes), Reg(ptr), Reg(ptr), KeepCarry)
			}
		}
	}
}

// DropInput deletes the named input from the pipe,
// usually because it has been exhausted.
// (This is not used yet but will be used in a future generator.)
func (p *Pipe) DropInput(name string) {
	i := slices.Index(p.f.inputs, name)
	if i < 0 {
		p.f.Asm.Fatalf("unknown input %s", name)
	}
	ptr := p.inPtr[i]
	p.f.Asm.Free(Reg(ptr))
	p.inPtr = slices.Delete(p.inPtr, i, i+1)
	p.f.inputs = slices.Delete(p.f.inputs, i, i+1)
	if len(p.hints) > i {
		p.hints = slices.Delete(p.hints, i, i+1)
	}
}

// Start prepares to loop over n columns.
// The factors give a sequence of unrolling factors to use,
// which must be either strictly increasing or strictly decreasing
// and must include 1.
// For example, 4, 1 means to process 4 elements at a time
// and then 1 at a time for the final 0-3; specifying 1,4 instead
// handles 0-3 elements first and then 4 at a time.
// Similarly, 32, 4, 1 means to process 32 at a time,
// then 4 at a time, then 1 at a time.
//
// One benefit of using 1, 4 instead of 4, 1 is that the body
// processing 4 at a time needs more registers, and if it is
// the final body, the register holding the fragment count (0-3)
// has been freed and is available for use.
//
// Start may modify the carry flag.
//
// Start must be followed by a call to Loop1 or LoopN,
// but it is permitted to emit other instructions first,
// for example to set an initial carry flag.
func (p *Pipe) Start(n Reg, factors ...int) {
	a := p.f.Asm
	if p.started {
		a.Fatalf("loop already started")
	}
	if p.useIndexCounter && len(factors) > 1 {
		a.Fatalf("cannot call SetUseIndexCounter and then use Start with factors != [1]; have factors = %v", factors)
	}
	p.started = true
	if !p.loaded {
		if len(factors) == 1 {
			p.SetUseIndexCounter()
		}
		p.LoadPtrs(n)
	}

	// If there were calls to LoadN between LoadPtrs and Start,
	// adjust the loop not to scan those columns, assuming that
	// either the code already called an equivalent StoreN or else
	// that it will do so after the loop.
	if off := p.readOff; off != 0 {
		if p.indexCounter < 0 {
			// Index is negated, so add off instead of subtracting.
			a.Add(a.Imm(off), n, n, SmashCarry)
		} else {
			a.Sub(a.Imm(off), n, n, SmashCarry)
		}
		if p.indexCounter != 0 {
			// n is also the index we are using, so adjust readOff and writeOff
			// to continue to point at the same positions as before we changed n.
			p.readOff -= off
			p.writeOff -= off
		}
	}

	p.Restart(n, factors...)
}

// Restart prepares to loop over an additional n columns,
// beyond a previous loop run by p.Start/p.Loop.
func (p *Pipe) Restart(n Reg, factors ...int) {
	a := p.f.Asm
	if !p.started {
		a.Fatalf("pipe not started")
	}
	p.factors = factors
	p.counts = make([]Reg, len(factors))
	if len(factors) == 0 {
		factors = []int{1}
	}

	// Compute the loop lengths for each unrolled section into separate registers.
	// We compute them all ahead of time in case the computation would smash
	// a carry flag that the loop bodies need preserved.
	if len(factors) > 1 {
		a.Comment("compute unrolled loop lengths")
	}
	switch {
	default:
		a.Fatalf("invalid factors %v", factors)

	case factors[0] == 1:
		// increasing loop factors
		div := 1
		for i, f := range factors[1:] {
			if f <= factors[i] {
				a.Fatalf("non-increasing factors %v", factors)
			}
			if f&(f-1) != 0 {
				a.Fatalf("non-power-of-two factors %v", factors)
			}
			t := p.f.Asm.Reg()
			f /= div
			a.And(a.Imm(f-1), n, t)
			a.Rsh(a.Imm(bits.TrailingZeros(uint(f))), n, n)
			div *= f
			p.counts[i] = t
		}
		p.counts[len(p.counts)-1] = n

	case factors[len(factors)-1] == 1:
		// decreasing loop factors
		for i, f := range factors[:len(factors)-1] {
			if f <= factors[i+1] {
				a.Fatalf("non-decreasing factors %v", factors)
			}
			if f&(f-1) != 0 {
				a.Fatalf("non-power-of-two factors %v", factors)
			}
			t := p.f.Asm.Reg()
			a.Rsh(a.Imm(bits.TrailingZeros(uint(f))), n, t)
			a.And(a.Imm(f-1), n, n)
			p.counts[i] = t
		}
		p.counts[len(p.counts)-1] = n
	}
}

// Done frees all the registers allocated by the pipe.
func (p *Pipe) Done() {
	for _, ptr := range p.inPtr {
		p.f.Asm.Free(Reg(ptr))
	}
	p.inPtr = nil
	for _, ptr := range p.outPtr {
		p.f.Asm.Free(Reg(ptr))
	}
	p.outPtr = nil
	p.index = Reg{}
}

// Loop emits code for the loop, calling block repeatedly to emit code that
// handles a block of N input columns (for arbitrary N = len(in[0]) chosen by p).
// block must call p.StoreN(out) to write N output columns.
// The out slice is a pre-allocated matrix of uninitialized Reg values.
// block is expected to set each entry to the Reg that should be written
// before calling p.StoreN(out).
//
// For example, if the loop is to be unrolled 4x in blocks of 2 columns each,
// the sequence of calls to emit the unrolled loop body is:
//
//	start()  // set by pAtUnrollStart
//	... reads for 2 columns ...
//	block()
//	... writes for 2 columns ...
//	... reads for 2 columns ...
//	block()
//	... writes for 2 columns ...
//	end()  // set by p.AtUnrollEnd
//
// Any registers allocated during block are freed automatically when block returns.
func (p *Pipe) Loop(block func(in, out [][]Reg)) {
	if p.factors == nil {
		p.f.Asm.Fatalf("Pipe.Start not called")
	}
	for i, factor := range p.factors {
		n := p.counts[i]
		p.unroll(n, factor, block)
		if i < len(p.factors)-1 {
			p.f.Asm.Free(n)
		}
	}
	p.factors = nil
}

// AtUnrollStart sets a function to call at the start of an unrolled sequence.
// See [Pipe.Loop] for details.
func (p *Pipe) AtUnrollStart(start func()) {
	p.unrollStart = start
}

// AtUnrollEnd sets a function to call at the end of an unrolled sequence.
// See [Pipe.Loop] for details.
func (p *Pipe) AtUnrollEnd(end func()) {
	p.unrollEnd = end
}

// unroll emits a single unrolled loop for the given factor, iterating n times.
func (p *Pipe) unroll(n Reg, factor int, block func(in, out [][]Reg)) {
	a := p.f.Asm
	label := fmt.Sprintf("%s%d", p.label, factor)

	// Top of loop control flow.
	a.Label(label)
	if a.Arch.loopTop != "" {
		a.Printf("\t"+a.Arch.loopTop+"\n", n, label+"done")
	} else {
		a.JmpZero(n, label+"done")
	}
	a.Label(label + "cont")

	// Unrolled loop body.
	if factor < p.maxColumns {
		a.Comment("unroll %dX", factor)
	} else {
		a.Comment("unroll %dX in batches of %d", factor, p.maxColumns)
	}
	if p.unrollStart != nil {
		p.unrollStart()
	}
	for done := 0; done < factor; {
		batch := min(factor-done, p.maxColumns)
		regs := a.RegsUsed()
		out := make([][]Reg, len(p.outPtr))
		for i := range out {
			out[i] = make([]Reg, batch)
		}
		in := p.LoadN(batch)
		p.needWrite = true
		block(in, out)
		if p.needWrite && len(p.outPtr) > 0 {
			a.Fatalf("missing p.Write1 or p.StoreN")
		}
		a.SetRegsUsed(regs) // free anything block allocated
		done += batch
	}
	if p.unrollEnd != nil {
		p.unrollEnd()
	}
	p.advancePtrs(factor)

	// Bottom of loop control flow.
	switch {
	case p.indexCounter >= 0 && a.Arch.loopBottom != "":
		a.Printf("\t"+a.Arch.loopBottom+"\n", n, label+"cont")

	case p.indexCounter >= 0:
		a.Sub(a.Imm(1), n, n, KeepCarry)
		a.JmpNonZero(n, label+"cont")

	case p.indexCounter < 0 && a.Arch.loopBottomNeg != "":
		a.Printf("\t"+a.Arch.loopBottomNeg+"\n", n, label+"cont")

	case p.indexCounter < 0:
		a.Add(a.Imm(1), n, n, KeepCarry)
	}
	a.Label(label + "done")
}