// Copyright 2025 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package asmgen import ( "fmt" "math/bits" "slices" ) // Note: Exported fields and methods are expected to be used // by function generators (like the ones in add.go and so on). // Unexported fields and methods should not be. // A Pipe manages the input and output data pipelines for a function's // memory operations. // // The input is one or more equal-length slices of words, so collectively // it can be viewed as a matrix, in which each slice is a row and each column // is a set of corresponding words from the different slices. // The output can be viewed the same way, although it is often just one row. type Pipe struct { f *Func // function being generated label string // prefix for loop labels (default "loop") backward bool // processing columns in reverse started bool // Start has been called loaded bool // LoadPtrs has been called inPtr []RegPtr // input slice pointers hints []Hint // for each inPtr, a register hint to use for its data outPtr []RegPtr // output slice pointers index Reg // index register, if in use useIndexCounter bool // index counter requested indexCounter int // index is also counter (386); 0 no, -1 negative counter, +1 positive counter readOff int // read offset not yet added to index writeOff int // write offset not yet added to index factors []int // unrolling factors counts []Reg // iterations for each factor needWrite bool // need a write call during Loop1/LoopN maxColumns int // maximum columns during unrolled loop unrollStart func() // emit code at start of unrolled body unrollEnd func() // emit code end of unrolled body } // Pipe creates and returns a new pipe for use in the function f. func (f *Func) Pipe() *Pipe { a := f.Asm p := &Pipe{ f: f, label: "loop", maxColumns: 10000000, } if m := a.Arch.maxColumns; m != 0 { p.maxColumns = m } return p } // SetBackward sets the pipe to process the input and output columns in reverse order. // This is needed for left shifts, which might otherwise overwrite data they will read later. func (p *Pipe) SetBackward() { if p.loaded { p.f.Asm.Fatalf("SetBackward after Start/LoadPtrs") } p.backward = true } // SetUseIndexCounter sets the pipe to use an index counter if possible, // meaning the loop counter is also used as an index for accessing the slice data. // This clever trick is slower on modern processors, but it is still necessary on 386. // On non-386 systems, SetUseIndexCounter is a no-op. func (p *Pipe) SetUseIndexCounter() { if p.f.Asm.Arch.memIndex == nil { // need memIndex (only 386 provides it) return } p.useIndexCounter = true } // SetLabel sets the label prefix for the loops emitted by the pipe. // The default prefix is "loop". func (p *Pipe) SetLabel(label string) { p.label = label } // SetMaxColumns sets the maximum number of // columns processed in a single loop body call. func (p *Pipe) SetMaxColumns(m int) { p.maxColumns = m } // SetHint records that the inputs from the named vector // should be allocated with the given register hint. // // If the hint indicates a single register on the target architecture, // then SetHint calls SetMaxColumns(1), since the hinted register // can only be used for one value at a time. func (p *Pipe) SetHint(name string, hint Hint) { if hint == HintMemOK && !p.f.Asm.Arch.memOK { return } i := slices.Index(p.f.inputs, name) if i < 0 { p.f.Asm.Fatalf("unknown input name %s", name) } if p.f.Asm.hint(hint) != "" { p.SetMaxColumns(1) } for len(p.hints) <= i { p.hints = append(p.hints, HintNone) } p.hints[i] = hint } // LoadPtrs loads the slice pointer arguments into registers, // assuming that the slice length n has already been loaded // into the register n. // // Start will call LoadPtrs if it has not been called already. // LoadPtrs only needs to be called explicitly when code needs // to use LoadN before Start, like when the shift.go generators // read an initial word before the loop. func (p *Pipe) LoadPtrs(n Reg) { a := p.f.Asm if p.loaded { a.Fatalf("pointers already loaded") } // Load the actual pointers. p.loaded = true for _, name := range p.f.inputs { p.inPtr = append(p.inPtr, RegPtr(p.f.Arg(name+"_base"))) } for _, name := range p.f.outputs { p.outPtr = append(p.outPtr, RegPtr(p.f.Arg(name+"_base"))) } // Decide the memory access strategy for LoadN and StoreN. switch { case p.backward && p.useIndexCounter: // Generator wants an index counter, meaning when the iteration counter // is AX, we will access the slice with pointer BX using (BX)(AX*WordBytes). // The loop is moving backward through the slice, but the counter // is also moving backward, so not much to do. a.Comment("run loop backward, using counter as positive index") p.indexCounter = +1 p.index = n case !p.backward && p.useIndexCounter: // Generator wants an index counter, but the loop is moving forward. // To make the counter move in the direction of data access, // we negate the counter, counting up from -len(z) to -1. // To make the index access the right words, we add len(z)*WordBytes // to each of the pointers. // See comment below about the garbage collector (non-)implications // of pointing beyond the slice bounds. a.Comment("use counter as negative index") p.indexCounter = -1 p.index = n for _, ptr := range p.inPtr { a.AddWords(n, ptr, ptr) } for _, ptr := range p.outPtr { a.AddWords(n, ptr, ptr) } a.Neg(n, n) case p.backward: // Generator wants to run the loop backward. // We'll decrement the pointers before using them, // so position them at the very end of the slices. // If we had precise pointer information for assembly, // these pointers would cause problems with the garbage collector, // since they no longer point into the allocated slice, // but the garbage collector ignores unexpected values in assembly stacks, // and the actual slice pointers are still in the argument stack slots, // so the slices won't be collected early. // If we switched to the register ABI, we might have to rethink this. // (The same thing happens by the end of forward loops, // but it's less important since once the pointers go off the slice // in a forward loop, the loop is over and the slice won't be accessed anymore.) a.Comment("run loop backward") for _, ptr := range p.inPtr { a.AddWords(n, ptr, ptr) } for _, ptr := range p.outPtr { a.AddWords(n, ptr, ptr) } case !p.backward: // Nothing to do! } } // LoadN returns the next n columns of input words as a slice of rows. // Regs for inputs that have been marked using p.SetMemOK will be direct memory references. // Regs for other inputs will be newly allocated registers and must be freed. func (p *Pipe) LoadN(n int) [][]Reg { a := p.f.Asm regs := make([][]Reg, len(p.inPtr)) for i, ptr := range p.inPtr { regs[i] = make([]Reg, n) switch { case a.Arch.loadIncN != nil: // Load from memory and advance pointers at the same time. for j := range regs[i] { regs[i][j] = p.f.Asm.Reg() } if p.backward { a.Arch.loadDecN(a, ptr, regs[i]) } else { a.Arch.loadIncN(a, ptr, regs[i]) } default: // Load from memory using offsets. // We'll advance the pointers or the index counter later. for j := range n { off := p.readOff + j if p.backward { off = -(off + 1) } var mem Reg if p.indexCounter != 0 { mem = a.Arch.memIndex(a, off*a.Arch.WordBytes, p.index, ptr) } else { mem = ptr.mem(off * a.Arch.WordBytes) } h := HintNone if i < len(p.hints) { h = p.hints[i] } if h == HintMemOK { regs[i][j] = mem } else { r := p.f.Asm.RegHint(h) a.Mov(mem, r) regs[i][j] = r } } } } p.readOff += n return regs } // StoreN writes regs (a slice of rows) to the next n columns of output, where n = len(regs[0]). func (p *Pipe) StoreN(regs [][]Reg) { p.needWrite = false a := p.f.Asm if len(regs) != len(p.outPtr) { p.f.Asm.Fatalf("wrong number of output rows") } n := len(regs[0]) for i, ptr := range p.outPtr { switch { case a.Arch.storeIncN != nil: // Store to memory and advance pointers at the same time. if p.backward { a.Arch.storeDecN(a, ptr, regs[i]) } else { a.Arch.storeIncN(a, ptr, regs[i]) } default: // Store to memory using offsets. // We'll advance the pointers or the index counter later. for j, r := range regs[i] { off := p.writeOff + j if p.backward { off = -(off + 1) } var mem Reg if p.indexCounter != 0 { mem = a.Arch.memIndex(a, off*a.Arch.WordBytes, p.index, ptr) } else { mem = ptr.mem(off * a.Arch.WordBytes) } a.Mov(r, mem) } } } p.writeOff += n } // advancePtrs advances the pointers by step // or handles bookkeeping for an imminent index advance by step // that the caller will do. func (p *Pipe) advancePtrs(step int) { a := p.f.Asm switch { case a.Arch.loadIncN != nil: // nothing to do default: // Adjust read/write offsets for pointer advance (or imminent index advance). p.readOff -= step p.writeOff -= step if p.indexCounter == 0 { // Advance pointers. if p.backward { step = -step } for _, ptr := range p.inPtr { a.Add(a.Imm(step*a.Arch.WordBytes), Reg(ptr), Reg(ptr), KeepCarry) } for _, ptr := range p.outPtr { a.Add(a.Imm(step*a.Arch.WordBytes), Reg(ptr), Reg(ptr), KeepCarry) } } } } // DropInput deletes the named input from the pipe, // usually because it has been exhausted. // (This is not used yet but will be used in a future generator.) func (p *Pipe) DropInput(name string) { i := slices.Index(p.f.inputs, name) if i < 0 { p.f.Asm.Fatalf("unknown input %s", name) } ptr := p.inPtr[i] p.f.Asm.Free(Reg(ptr)) p.inPtr = slices.Delete(p.inPtr, i, i+1) p.f.inputs = slices.Delete(p.f.inputs, i, i+1) if len(p.hints) > i { p.hints = slices.Delete(p.hints, i, i+1) } } // Start prepares to loop over n columns. // The factors give a sequence of unrolling factors to use, // which must be either strictly increasing or strictly decreasing // and must include 1. // For example, 4, 1 means to process 4 elements at a time // and then 1 at a time for the final 0-3; specifying 1,4 instead // handles 0-3 elements first and then 4 at a time. // Similarly, 32, 4, 1 means to process 32 at a time, // then 4 at a time, then 1 at a time. // // One benefit of using 1, 4 instead of 4, 1 is that the body // processing 4 at a time needs more registers, and if it is // the final body, the register holding the fragment count (0-3) // has been freed and is available for use. // // Start may modify the carry flag. // // Start must be followed by a call to Loop1 or LoopN, // but it is permitted to emit other instructions first, // for example to set an initial carry flag. func (p *Pipe) Start(n Reg, factors ...int) { a := p.f.Asm if p.started { a.Fatalf("loop already started") } if p.useIndexCounter && len(factors) > 1 { a.Fatalf("cannot call SetUseIndexCounter and then use Start with factors != [1]; have factors = %v", factors) } p.started = true if !p.loaded { if len(factors) == 1 { p.SetUseIndexCounter() } p.LoadPtrs(n) } // If there were calls to LoadN between LoadPtrs and Start, // adjust the loop not to scan those columns, assuming that // either the code already called an equivalent StoreN or else // that it will do so after the loop. if off := p.readOff; off != 0 { if p.indexCounter < 0 { // Index is negated, so add off instead of subtracting. a.Add(a.Imm(off), n, n, SmashCarry) } else { a.Sub(a.Imm(off), n, n, SmashCarry) } if p.indexCounter != 0 { // n is also the index we are using, so adjust readOff and writeOff // to continue to point at the same positions as before we changed n. p.readOff -= off p.writeOff -= off } } p.Restart(n, factors...) } // Restart prepares to loop over an additional n columns, // beyond a previous loop run by p.Start/p.Loop. func (p *Pipe) Restart(n Reg, factors ...int) { a := p.f.Asm if !p.started { a.Fatalf("pipe not started") } p.factors = factors p.counts = make([]Reg, len(factors)) if len(factors) == 0 { factors = []int{1} } // Compute the loop lengths for each unrolled section into separate registers. // We compute them all ahead of time in case the computation would smash // a carry flag that the loop bodies need preserved. if len(factors) > 1 { a.Comment("compute unrolled loop lengths") } switch { default: a.Fatalf("invalid factors %v", factors) case factors[0] == 1: // increasing loop factors div := 1 for i, f := range factors[1:] { if f <= factors[i] { a.Fatalf("non-increasing factors %v", factors) } if f&(f-1) != 0 { a.Fatalf("non-power-of-two factors %v", factors) } t := p.f.Asm.Reg() f /= div a.And(a.Imm(f-1), n, t) a.Rsh(a.Imm(bits.TrailingZeros(uint(f))), n, n) div *= f p.counts[i] = t } p.counts[len(p.counts)-1] = n case factors[len(factors)-1] == 1: // decreasing loop factors for i, f := range factors[:len(factors)-1] { if f <= factors[i+1] { a.Fatalf("non-decreasing factors %v", factors) } if f&(f-1) != 0 { a.Fatalf("non-power-of-two factors %v", factors) } t := p.f.Asm.Reg() a.Rsh(a.Imm(bits.TrailingZeros(uint(f))), n, t) a.And(a.Imm(f-1), n, n) p.counts[i] = t } p.counts[len(p.counts)-1] = n } } // Done frees all the registers allocated by the pipe. func (p *Pipe) Done() { for _, ptr := range p.inPtr { p.f.Asm.Free(Reg(ptr)) } p.inPtr = nil for _, ptr := range p.outPtr { p.f.Asm.Free(Reg(ptr)) } p.outPtr = nil p.index = Reg{} } // Loop emits code for the loop, calling block repeatedly to emit code that // handles a block of N input columns (for arbitrary N = len(in[0]) chosen by p). // block must call p.StoreN(out) to write N output columns. // The out slice is a pre-allocated matrix of uninitialized Reg values. // block is expected to set each entry to the Reg that should be written // before calling p.StoreN(out). // // For example, if the loop is to be unrolled 4x in blocks of 2 columns each, // the sequence of calls to emit the unrolled loop body is: // // start() // set by pAtUnrollStart // ... reads for 2 columns ... // block() // ... writes for 2 columns ... // ... reads for 2 columns ... // block() // ... writes for 2 columns ... // end() // set by p.AtUnrollEnd // // Any registers allocated during block are freed automatically when block returns. func (p *Pipe) Loop(block func(in, out [][]Reg)) { if p.factors == nil { p.f.Asm.Fatalf("Pipe.Start not called") } for i, factor := range p.factors { n := p.counts[i] p.unroll(n, factor, block) if i < len(p.factors)-1 { p.f.Asm.Free(n) } } p.factors = nil } // AtUnrollStart sets a function to call at the start of an unrolled sequence. // See [Pipe.Loop] for details. func (p *Pipe) AtUnrollStart(start func()) { p.unrollStart = start } // AtUnrollEnd sets a function to call at the end of an unrolled sequence. // See [Pipe.Loop] for details. func (p *Pipe) AtUnrollEnd(end func()) { p.unrollEnd = end } // unroll emits a single unrolled loop for the given factor, iterating n times. func (p *Pipe) unroll(n Reg, factor int, block func(in, out [][]Reg)) { a := p.f.Asm label := fmt.Sprintf("%s%d", p.label, factor) // Top of loop control flow. a.Label(label) if a.Arch.loopTop != "" { a.Printf("\t"+a.Arch.loopTop+"\n", n, label+"done") } else { a.JmpZero(n, label+"done") } a.Label(label + "cont") // Unrolled loop body. if factor < p.maxColumns { a.Comment("unroll %dX", factor) } else { a.Comment("unroll %dX in batches of %d", factor, p.maxColumns) } if p.unrollStart != nil { p.unrollStart() } for done := 0; done < factor; { batch := min(factor-done, p.maxColumns) regs := a.RegsUsed() out := make([][]Reg, len(p.outPtr)) for i := range out { out[i] = make([]Reg, batch) } in := p.LoadN(batch) p.needWrite = true block(in, out) if p.needWrite && len(p.outPtr) > 0 { a.Fatalf("missing p.Write1 or p.StoreN") } a.SetRegsUsed(regs) // free anything block allocated done += batch } if p.unrollEnd != nil { p.unrollEnd() } p.advancePtrs(factor) // Bottom of loop control flow. switch { case p.indexCounter >= 0 && a.Arch.loopBottom != "": a.Printf("\t"+a.Arch.loopBottom+"\n", n, label+"cont") case p.indexCounter >= 0: a.Sub(a.Imm(1), n, n, KeepCarry) a.JmpNonZero(n, label+"cont") case p.indexCounter < 0 && a.Arch.loopBottomNeg != "": a.Printf("\t"+a.Arch.loopBottomNeg+"\n", n, label+"cont") case p.indexCounter < 0: a.Add(a.Imm(1), n, n, KeepCarry) } a.Label(label + "done") }