nuudlman/llvm: llvm/lib/Target/AMDGPU/SIMachineScheduler.h

//===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H

#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include <cstdint>
#include <set>
#include <vector>

namespace llvm {

class SIInstrInfo;
class SIRegisterInfo;
class SIScheduleDAGMI;
class SIScheduleBlockCreator;

enum SIScheduleCandReason {
  NoCand,
  RegUsage,
  Latency,
  Successor,
  Depth,
  NodeOrder
};

struct SISchedulerCandidate {
  // The reason for this candidate.
  SIScheduleCandReason Reason = NoCand;

  // Set of reasons that apply to multiple candidates.
  uint32_t RepeatReasonSet = 0;

  SISchedulerCandidate() = default;

  bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
  void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
};

enum SIScheduleBlockLinkKind {
  NoData,
  Data
};

class SIScheduleBlock {
  SIScheduleDAGMI *DAG;
  SIScheduleBlockCreator *BC;

  std::vector<SUnit*> SUnits;
  std::map<unsigned, unsigned> NodeNum2Index;
  std::vector<SUnit*> TopReadySUs;
  std::vector<SUnit*> ScheduledSUnits;

  /// The top of the unscheduled zone.
  IntervalPressure TopPressure;
  RegPressureTracker TopRPTracker;

  // Pressure: number of said class of registers needed to
  // store the live virtual and real registers.
  // We do care only of SGPR32 and VGPR32 and do track only virtual registers.
  // Pressure of additional registers required inside the block.
  std::vector<unsigned> InternalAdditionalPressure;
  // Pressure of input and output registers
  std::vector<unsigned> LiveInPressure;
  std::vector<unsigned> LiveOutPressure;
  // Registers required by the block, and outputs.
  // We do track only virtual registers.
  // Note that some registers are not 32 bits,
  // and thus the pressure is not equal
  // to the number of live registers.
  std::set<unsigned> LiveInRegs;
  std::set<unsigned> LiveOutRegs;

  bool Scheduled = false;
  bool HighLatencyBlock = false;

  std::vector<unsigned> HasLowLatencyNonWaitedParent;

  // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
  unsigned ID;

  std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
  // All blocks successors, and the kind of link
  std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs;
  unsigned NumHighLatencySuccessors = 0;

public:
  SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
                  unsigned ID):
    DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {}

  ~SIScheduleBlock() = default;

  unsigned getID() const { return ID; }

  /// Functions for Block construction.
  void addUnit(SUnit *SU);

  // When all SUs have been added.
  void finalizeUnits();

  // Add block pred, which has instruction predecessor of SU.
  void addPred(SIScheduleBlock *Pred);
  void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind);

  const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
  ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>>
    getSuccs() const { return Succs; }

  unsigned Height;  // Maximum topdown path length to block without outputs
  unsigned Depth;   // Maximum bottomup path length to block without inputs

  unsigned getNumHighLatencySuccessors() const {
    return NumHighLatencySuccessors;
  }

  bool isHighLatencyBlock() { return HighLatencyBlock; }

  // This is approximative.
  // Ideally should take into accounts some instructions (rcp, etc)
  // are 4 times slower.
  int getCost() { return SUnits.size(); }

  // The block Predecessors and Successors must be all registered
  // before fastSchedule().
  // Fast schedule with no particular requirement.
  void fastSchedule();

  std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }

  // Complete schedule that will try to minimize reg pressure and
  // low latencies, and will fill liveins and liveouts.
  // Needs all MIs to be grouped between BeginBlock and EndBlock.
  // The MIs can be moved after the scheduling,
  // it is just used to allow correct track of live registers.
  void schedule(MachineBasicBlock::iterator BeginBlock,
                MachineBasicBlock::iterator EndBlock);

  bool isScheduled() { return Scheduled; }

  // Needs the block to be scheduled inside
  // TODO: find a way to compute it.
  std::vector<unsigned> &getInternalAdditionalRegUsage() {
    return InternalAdditionalPressure;
  }

  std::set<unsigned> &getInRegs() { return LiveInRegs; }
  std::set<unsigned> &getOutRegs() { return LiveOutRegs; }

  void printDebug(bool Full);

private:
  struct SISchedCandidate : SISchedulerCandidate {
    // The best SUnit candidate.
    SUnit *SU = nullptr;

    unsigned SGPRUsage;
    unsigned VGPRUsage;
    bool IsLowLatency;
    unsigned LowLatencyOffset;
    bool HasLowLatencyNonWaitedParent;

    SISchedCandidate() = default;

    bool isValid() const { return SU; }

    // Copy the status of another candidate without changing policy.
    void setBest(SISchedCandidate &Best) {
      assert(Best.Reason != NoCand && "uninitialized Sched candidate");
      SU = Best.SU;
      Reason = Best.Reason;
      SGPRUsage = Best.SGPRUsage;
      VGPRUsage = Best.VGPRUsage;
      IsLowLatency = Best.IsLowLatency;
      LowLatencyOffset = Best.LowLatencyOffset;
      HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
    }
  };

  void undoSchedule();

  void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
  void releaseSucc(SUnit *SU, SDep *SuccEdge);
  // InOrOutBlock: restrict to links pointing inside the block (true),
  // or restrict to links pointing outside the block (false).
  void releaseSuccessors(SUnit *SU, bool InOrOutBlock);

  void nodeScheduled(SUnit *SU);
  void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
  void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
  SUnit* pickNode();
  void traceCandidate(const SISchedCandidate &Cand);
  void initRegPressure(MachineBasicBlock::iterator BeginBlock,
                       MachineBasicBlock::iterator EndBlock);
};

struct SIScheduleBlocks {
  std::vector<SIScheduleBlock*> Blocks;
  std::vector<int> TopDownIndex2Block;
  std::vector<int> TopDownBlock2Index;
};

enum SISchedulerBlockCreatorVariant {
  LatenciesAlone,
  LatenciesGrouped,
  LatenciesAlonePlusConsecutive
};

class SIScheduleBlockCreator {
  SIScheduleDAGMI *DAG;
  // unique_ptr handles freeing memory for us.
  std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
  std::map<SISchedulerBlockCreatorVariant,
           SIScheduleBlocks> Blocks;
  std::vector<SIScheduleBlock*> CurrentBlocks;
  std::vector<int> Node2CurrentBlock;

  // Topological sort
  // Maps topological index to the node number.
  std::vector<int> TopDownIndex2Block;
  std::vector<int> TopDownBlock2Index;
  std::vector<int> BottomUpIndex2Block;

  // 0 -> Color not given.
  // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
  // Above -> Other groups.
  int NextReservedID;
  int NextNonReservedID;
  std::vector<int> CurrentColoring;
  std::vector<int> CurrentTopDownReservedDependencyColoring;
  std::vector<int> CurrentBottomUpReservedDependencyColoring;

public:
  SIScheduleBlockCreator(SIScheduleDAGMI *DAG);

  SIScheduleBlocks
  getBlocks(SISchedulerBlockCreatorVariant BlockVariant);

  bool isSUInBlock(SUnit *SU, unsigned ID);

private:
  // Give a Reserved color to every high latency.
  void colorHighLatenciesAlone();

  // Create groups of high latencies with a Reserved color.
  void colorHighLatenciesGroups();

  // Compute coloring for topdown and bottom traversals with
  // different colors depending on dependencies on Reserved colors.
  void colorComputeReservedDependencies();

  // Give color to all non-colored SUs according to Reserved groups dependencies.
  void colorAccordingToReservedDependencies();

  // Divides Blocks having no bottom up or top down dependencies on Reserved groups.
  // The new colors are computed according to the dependencies on the other blocks
  // formed with colorAccordingToReservedDependencies.
  void colorEndsAccordingToDependencies();

  // Cut groups into groups with SUs in consecutive order (except for Reserved groups).
  void colorForceConsecutiveOrderInGroup();

  // Merge Constant loads that have all their users into another group to the group.
  // (TODO: else if all their users depend on the same group, put them there)
  void colorMergeConstantLoadsNextGroup();

  // Merge SUs that have all their users into another group to the group
  void colorMergeIfPossibleNextGroup();

  // Merge SUs that have all their users into another group to the group,
  // but only for Reserved groups.
  void colorMergeIfPossibleNextGroupOnlyForReserved();

  // Merge SUs that have all their users into another group to the group,
  // but only if the group is no more than a few SUs.
  void colorMergeIfPossibleSmallGroupsToNextGroup();

  // Divides Blocks with important size.
  // Idea of implementation: attribute new colors depending on topdown and
  // bottom up links to other blocks.
  void cutHugeBlocks();

  // Put in one group all instructions with no users in this scheduling region
  // (we'd want these groups be at the end).
  void regroupNoUserInstructions();

  // Give Reserved color to export instructions
  void colorExports();

  void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);

  void topologicalSort();

  void scheduleInsideBlocks();

  void fillStats();
};

enum SISchedulerBlockSchedulerVariant {
  BlockLatencyRegUsage,
  BlockRegUsageLatency,
  BlockRegUsage
};

class SIScheduleBlockScheduler {
  SIScheduleDAGMI *DAG;
  SISchedulerBlockSchedulerVariant Variant;
  std::vector<SIScheduleBlock*> Blocks;

  std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
  std::set<unsigned> LiveRegs;
  // Num of schedulable unscheduled blocks reading the register.
  std::map<unsigned, unsigned> LiveRegsConsumers;

  std::vector<unsigned> LastPosHighLatencyParentScheduled;
  int LastPosWaitedHighLatency;

  std::vector<SIScheduleBlock*> BlocksScheduled;
  unsigned NumBlockScheduled;
  std::vector<SIScheduleBlock*> ReadyBlocks;

  unsigned VregCurrentUsage;
  unsigned SregCurrentUsage;

  // Currently is only approximation.
  unsigned maxVregUsage;
  unsigned maxSregUsage;

  std::vector<unsigned> BlockNumPredsLeft;
  std::vector<unsigned> BlockNumSuccsLeft;

public:
  SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                           SISchedulerBlockSchedulerVariant Variant,
                           SIScheduleBlocks BlocksStruct);
  ~SIScheduleBlockScheduler() = default;

  std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }

  unsigned getVGPRUsage() { return maxVregUsage; }
  unsigned getSGPRUsage() { return maxSregUsage; }

private:
  struct SIBlockSchedCandidate : SISchedulerCandidate {
    // The best Block candidate.
    SIScheduleBlock *Block = nullptr;

    bool IsHighLatency;
    int VGPRUsageDiff;
    unsigned NumSuccessors;
    unsigned NumHighLatencySuccessors;
    unsigned LastPosHighLatParentScheduled;
    unsigned Height;

    SIBlockSchedCandidate() = default;

    bool isValid() const { return Block; }

    // Copy the status of another candidate without changing policy.
    void setBest(SIBlockSchedCandidate &Best) {
      assert(Best.Reason != NoCand && "uninitialized Sched candidate");
      Block = Best.Block;
      Reason = Best.Reason;
      IsHighLatency = Best.IsHighLatency;
      VGPRUsageDiff = Best.VGPRUsageDiff;
      NumSuccessors = Best.NumSuccessors;
      NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
      LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
      Height = Best.Height;
    }
  };

  bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
                           SIBlockSchedCandidate &TryCand);
  bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                            SIBlockSchedCandidate &TryCand);
  SIScheduleBlock *pickBlock();

  void addLiveRegs(std::set<unsigned> &Regs);
  void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
  void releaseBlockSuccs(SIScheduleBlock *Parent);
  void blockScheduled(SIScheduleBlock *Block);

  // Check register pressure change
  // by scheduling a block with these LiveIn and LiveOut.
  std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
                                       std::set<unsigned> &OutRegs);

  void schedule();
};

struct SIScheduleBlockResult {
  std::vector<unsigned> SUs;
  unsigned MaxSGPRUsage;
  unsigned MaxVGPRUsage;
};

class SIScheduler {
  SIScheduleDAGMI *DAG;
  SIScheduleBlockCreator BlockCreator;

public:
  SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}

  ~SIScheduler() = default;

  struct SIScheduleBlockResult
  scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                  SISchedulerBlockSchedulerVariant ScheduleVariant);
};

class SIScheduleDAGMI final : public ScheduleDAGMILive {
  const SIInstrInfo *SITII;
  const SIRegisterInfo *SITRI;

  std::vector<SUnit> SUnitsLinksBackup;

  // For moveLowLatencies. After all Scheduling variants are tested.
  std::vector<unsigned> ScheduledSUnits;
  std::vector<unsigned> ScheduledSUnitsInv;

public:
  SIScheduleDAGMI(MachineSchedContext *C);

  ~SIScheduleDAGMI() override;

  // Entry point for the schedule.
  void schedule() override;

  // To init Block's RPTracker.
  void initRPTracker(RegPressureTracker &RPTracker) {
    RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
  }

  MachineBasicBlock *getBB() { return BB; }
  MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
  MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
  LiveIntervals *getLIS() { return LIS; }
  MachineRegisterInfo *getMRI() { return &MRI; }
  const TargetRegisterInfo *getTRI() { return TRI; }
  ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
  SUnit &getEntrySU() { return EntrySU; }
  SUnit& getExitSU() { return ExitSU; }

  void restoreSULinksLeft();

  template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
                                                     _Iterator End,
                                                     unsigned &VgprUsage,
                                                     unsigned &SgprUsage);

  std::set<unsigned> getInRegs() {
    std::set<unsigned> InRegs;
    for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
      InRegs.insert(RegMaskPair.RegUnit);
    }
    return InRegs;
  }

  std::set<unsigned> getOutRegs() {
    std::set<unsigned> OutRegs;
    for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
      OutRegs.insert(RegMaskPair.RegUnit);
    }
    return OutRegs;
  };

private:
  void topologicalSort();
  // After scheduling is done, improve low latency placements.
  void moveLowLatencies();

public:
  // Some stats for scheduling inside blocks.
  std::vector<unsigned> IsLowLatencySU;
  std::vector<unsigned> LowLatencyOffset;
  std::vector<unsigned> IsHighLatencySU;
  // Topological sort
  // Maps topological index to the node number.
  std::vector<int> TopDownIndex2SU;
  std::vector<int> BottomUpIndex2SU;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H