Compiler projects using llvm
//=-- SystemZHazardRecognizer.h - SystemZ Hazard Recognizer -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a hazard recognizer for the SystemZ scheduler.
//
// This class is used by the SystemZ scheduling strategy to maintain
// the state during scheduling, and provide cost functions for
// scheduling candidates. This includes:
//
// * Decoder grouping. A decoder group can maximally hold 3 uops, and
// instructions that always begin a new group should be scheduled when
// the current decoder group is empty.
// * Processor resources usage. It is beneficial to balance the use of
// resources.
//
// A goal is to consider all instructions, also those outside of any
// scheduling region. Such instructions are "advanced" past and include
// single instructions before a scheduling region, branches etc.
//
// A block that has only one predecessor continues scheduling with the state
// of it (which may be updated by emitting branches).
//
// ===---------------------------------------------------------------------===//

#include "SystemZHazardRecognizer.h"
#include "llvm/ADT/Statistic.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

// This is the limit of processor resource usage at which the
// scheduler should try to look for other instructions (not using the
// critical resource).
static cl::opt<int> ProcResCostLim("procres-cost-lim", cl::Hidden,
                                   cl::desc("The OOO window for processor "
                                            "resources during scheduling."),
                                   cl::init(8));

unsigned SystemZHazardRecognizer::
getNumDecoderSlots(SUnit *SU) const {
  const MCSchedClassDesc *SC = getSchedClass(SU);
  if (!SC->isValid())
    return 0; // IMPLICIT_DEF / KILL -- will not make impact in output.

  assert((SC->NumMicroOps != 2 || (SC->BeginGroup && !SC->EndGroup)) &&
         "Only cracked instruction can have 2 uops.");
  assert((SC->NumMicroOps < 3 || (SC->BeginGroup && SC->EndGroup)) &&
         "Expanded instructions always group alone.");
  assert((SC->NumMicroOps < 3 || (SC->NumMicroOps % 3 == 0)) &&
         "Expanded instructions fill the group(s).");

  return SC->NumMicroOps;
}

unsigned SystemZHazardRecognizer::getCurrCycleIdx(SUnit *SU) const {
  unsigned Idx = CurrGroupSize;
  if (GrpCount % 2)
    Idx += 3;

  if (SU != nullptr && !fitsIntoCurrentGroup(SU)) {
    if (Idx == 1 || Idx == 2)
      Idx = 3;
    else if (Idx == 4 || Idx == 5)
      Idx = 0;
  }

  return Idx;
}

ScheduleHazardRecognizer::HazardType SystemZHazardRecognizer::
getHazardType(SUnit *SU, int Stalls) {
  return (fitsIntoCurrentGroup(SU) ? NoHazard : Hazard);
}

void SystemZHazardRecognizer::Reset() {
  CurrGroupSize = 0;
  CurrGroupHas4RegOps = false;
  clearProcResCounters();
  GrpCount = 0;
  LastFPdOpCycleIdx = UINT_MAX;
  LastEmittedMI = nullptr;
  LLVM_DEBUG(CurGroupDbg = "";);
}

bool
SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const {
  const MCSchedClassDesc *SC = getSchedClass(SU);
  if (!SC->isValid())
    return true;

  // A cracked instruction only fits into schedule if the current
  // group is empty.
  if (SC->BeginGroup)
    return (CurrGroupSize == 0);

  // An instruction with 4 register operands will not fit in last slot.
  assert ((CurrGroupSize < 2 || !CurrGroupHas4RegOps) &&
          "Current decoder group is already full!");
  if (CurrGroupSize == 2 && has4RegOps(SU->getInstr()))
    return false;

  // Since a full group is handled immediately in EmitInstruction(),
  // SU should fit into current group. NumSlots should be 1 or 0,
  // since it is not a cracked or expanded instruction.
  assert ((getNumDecoderSlots(SU) <= 1) && (CurrGroupSize < 3) &&
          "Expected normal instruction to fit in non-full group!");

  return true;
}

bool SystemZHazardRecognizer::has4RegOps(const MachineInstr *MI) const {
  const MachineFunction &MF = *MI->getParent()->getParent();
  const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &MID = MI->getDesc();
  unsigned Count = 0;
  for (unsigned OpIdx = 0; OpIdx < MID.getNumOperands(); OpIdx++) {
    const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx, TRI, MF);
    if (RC == nullptr)
      continue;
    if (OpIdx >= MID.getNumDefs() &&
        MID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
      continue;
    Count++;
  }
  return Count >= 4;
}

void SystemZHazardRecognizer::nextGroup() {
  if (CurrGroupSize == 0)
    return;

  LLVM_DEBUG(dumpCurrGroup("Completed decode group"));
  LLVM_DEBUG(CurGroupDbg = "";);

  int NumGroups = ((CurrGroupSize > 3) ? (CurrGroupSize / 3) : 1);
  assert((CurrGroupSize <= 3 || CurrGroupSize % 3 == 0) &&
         "Current decoder group bad.");

  // Reset counter for next group.
  CurrGroupSize = 0;
  CurrGroupHas4RegOps = false;

  GrpCount += ((unsigned) NumGroups);

  // Decrease counters for execution units.
  for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
    ProcResourceCounters[i] = ((ProcResourceCounters[i] > NumGroups)
                                   ? (ProcResourceCounters[i] - NumGroups)
                                   : 0);

  // Clear CriticalResourceIdx if it is now below the threshold.
  if (CriticalResourceIdx != UINT_MAX &&
      (ProcResourceCounters[CriticalResourceIdx] <=
       ProcResCostLim))
    CriticalResourceIdx = UINT_MAX;

  LLVM_DEBUG(dumpState(););
}

#ifndef NDEBUG // Debug output
void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const {
  OS << "SU(" << SU->NodeNum << "):";
  OS << TII->getName(SU->getInstr()->getOpcode());

  const MCSchedClassDesc *SC = getSchedClass(SU);
  if (!SC->isValid())
    return;

  for (TargetSchedModel::ProcResIter
         PI = SchedModel->getWriteProcResBegin(SC),
         PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
    const MCProcResourceDesc &PRD =
      *SchedModel->getProcResource(PI->ProcResourceIdx);
    std::string FU(PRD.Name);
    // trim e.g. Z13_FXaUnit -> FXa
    FU = FU.substr(FU.find('_') + 1);
    size_t Pos = FU.find("Unit");
    if (Pos != std::string::npos)
      FU.resize(Pos);
    if (FU == "LS") // LSUnit -> LSU
      FU = "LSU";
    OS << "/" << FU;

    if (PI->Cycles > 1)
      OS << "(" << PI->Cycles << "cyc)";
  }

  if (SC->NumMicroOps > 1)
    OS << "/" << SC->NumMicroOps << "uops";
  if (SC->BeginGroup && SC->EndGroup)
    OS << "/GroupsAlone";
  else if (SC->BeginGroup)
    OS << "/BeginsGroup";
  else if (SC->EndGroup)
    OS << "/EndsGroup";
  if (SU->isUnbuffered)
    OS << "/Unbuffered";
  if (has4RegOps(SU->getInstr()))
    OS << "/4RegOps";
}

void SystemZHazardRecognizer::dumpCurrGroup(std::string Msg) const {
  dbgs() << "++ " << Msg;
  dbgs() << ": ";

  if (CurGroupDbg.empty())
    dbgs() << " <empty>\n";
  else {
    dbgs() << "{ " << CurGroupDbg << " }";
    dbgs() << " (" << CurrGroupSize << " decoder slot"
           << (CurrGroupSize > 1 ? "s":"")
           << (CurrGroupHas4RegOps ? ", 4RegOps" : "")
           << ")\n";
  }
}

void SystemZHazardRecognizer::dumpProcResourceCounters() const {
  bool any = false;

  for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
    if (ProcResourceCounters[i] > 0) {
      any = true;
      break;
    }

  if (!any)
    return;

  dbgs() << "++ | Resource counters: ";
  for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
    if (ProcResourceCounters[i] > 0)
      dbgs() << SchedModel->getProcResource(i)->Name
             << ":" << ProcResourceCounters[i] << " ";
  dbgs() << "\n";

  if (CriticalResourceIdx != UINT_MAX)
    dbgs() << "++ | Critical resource: "
           << SchedModel->getProcResource(CriticalResourceIdx)->Name
           << "\n";
}

void SystemZHazardRecognizer::dumpState() const {
  dumpCurrGroup("| Current decoder group");
  dbgs() << "++ | Current cycle index: "
         << getCurrCycleIdx() << "\n";
  dumpProcResourceCounters();
  if (LastFPdOpCycleIdx != UINT_MAX)
    dbgs() << "++ | Last FPd cycle index: " << LastFPdOpCycleIdx << "\n";
}

#endif //NDEBUG

void SystemZHazardRecognizer::clearProcResCounters() {
  ProcResourceCounters.assign(SchedModel->getNumProcResourceKinds(), 0);
  CriticalResourceIdx = UINT_MAX;
}

static inline bool isBranchRetTrap(MachineInstr *MI) {
  return (MI->isBranch() || MI->isReturn() ||
          MI->getOpcode() == SystemZ::CondTrap);
}

// Update state with SU as the next scheduled unit.
void SystemZHazardRecognizer::
EmitInstruction(SUnit *SU) {
  const MCSchedClassDesc *SC = getSchedClass(SU);
  LLVM_DEBUG(dbgs() << "++ HazardRecognizer emitting "; dumpSU(SU, dbgs());
             dbgs() << "\n";);
  LLVM_DEBUG(dumpCurrGroup("Decode group before emission"););

  // If scheduling an SU that must begin a new decoder group, move on
  // to next group.
  if (!fitsIntoCurrentGroup(SU))
    nextGroup();

  LLVM_DEBUG(raw_string_ostream cgd(CurGroupDbg);
             if (CurGroupDbg.length()) cgd << ", "; dumpSU(SU, cgd););

  LastEmittedMI = SU->getInstr();

  // After returning from a call, we don't know much about the state.
  if (SU->isCall) {
    LLVM_DEBUG(dbgs() << "++ Clearing state after call.\n";);
    Reset();
    LastEmittedMI = SU->getInstr();
    return;
  }

  // Increase counter for execution unit(s).
  for (TargetSchedModel::ProcResIter
         PI = SchedModel->getWriteProcResBegin(SC),
         PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
    // Don't handle FPd together with the other resources.
    if (SchedModel->getProcResource(PI->ProcResourceIdx)->BufferSize == 1)
      continue;
    int &CurrCounter =
      ProcResourceCounters[PI->ProcResourceIdx];
    CurrCounter += PI->Cycles;
    // Check if this is now the new critical resource.
    if ((CurrCounter > ProcResCostLim) &&
        (CriticalResourceIdx == UINT_MAX ||
         (PI->ProcResourceIdx != CriticalResourceIdx &&
          CurrCounter >
          ProcResourceCounters[CriticalResourceIdx]))) {
      LLVM_DEBUG(
          dbgs() << "++ New critical resource: "
                 << SchedModel->getProcResource(PI->ProcResourceIdx)->Name
                 << "\n";);
      CriticalResourceIdx = PI->ProcResourceIdx;
    }
  }

  // Make note of an instruction that uses a blocking resource (FPd).
  if (SU->isUnbuffered) {
    LastFPdOpCycleIdx = getCurrCycleIdx(SU);
    LLVM_DEBUG(dbgs() << "++ Last FPd cycle index: " << LastFPdOpCycleIdx
                      << "\n";);
  }

  // Insert SU into current group by increasing number of slots used
  // in current group.
  CurrGroupSize += getNumDecoderSlots(SU);
  CurrGroupHas4RegOps |= has4RegOps(SU->getInstr());
  unsigned GroupLim = (CurrGroupHas4RegOps ? 2 : 3);
  assert((CurrGroupSize <= GroupLim || CurrGroupSize == getNumDecoderSlots(SU))
         && "SU does not fit into decoder group!");

  // Check if current group is now full/ended. If so, move on to next
  // group to be ready to evaluate more candidates.
  if (CurrGroupSize >= GroupLim || SC->EndGroup)
    nextGroup();
}

int SystemZHazardRecognizer::groupingCost(SUnit *SU) const {
  const MCSchedClassDesc *SC = getSchedClass(SU);
  if (!SC->isValid())
    return 0;

  // If SU begins new group, it can either break a current group early
  // or fit naturally if current group is empty (negative cost).
  if (SC->BeginGroup) {
    if (CurrGroupSize)
      return 3 - CurrGroupSize;
    return -1;
  }

  // Similarly, a group-ending SU may either fit well (last in group), or
  // end the group prematurely.
  if (SC->EndGroup) {
    unsigned resultingGroupSize =
      (CurrGroupSize + getNumDecoderSlots(SU));
    if (resultingGroupSize < 3)
      return (3 - resultingGroupSize);
    return -1;
  }

  // An instruction with 4 register operands will not fit in last slot.
  if (CurrGroupSize == 2 && has4RegOps(SU->getInstr()))
    return 1;

  // Most instructions can be placed in any decoder slot.
  return 0;
}

bool SystemZHazardRecognizer::isFPdOpPreferred_distance(SUnit *SU) const {
  assert (SU->isUnbuffered);
  // If this is the first FPd op, it should be scheduled high.
  if (LastFPdOpCycleIdx == UINT_MAX)
    return true;
  // If this is not the first PFd op, it should go into the other side
  // of the processor to use the other FPd unit there. This should
  // generally happen if two FPd ops are placed with 2 other
  // instructions between them (modulo 6).
  unsigned SUCycleIdx = getCurrCycleIdx(SU);
  if (LastFPdOpCycleIdx > SUCycleIdx)
    return ((LastFPdOpCycleIdx - SUCycleIdx) == 3);
  return ((SUCycleIdx - LastFPdOpCycleIdx) == 3);
}

int SystemZHazardRecognizer::
resourcesCost(SUnit *SU) {
  int Cost = 0;

  const MCSchedClassDesc *SC = getSchedClass(SU);
  if (!SC->isValid())
    return 0;

  // For a FPd op, either return min or max value as indicated by the
  // distance to any prior FPd op.
  if (SU->isUnbuffered)
    Cost = (isFPdOpPreferred_distance(SU) ? INT_MIN : INT_MAX);
  // For other instructions, give a cost to the use of the critical resource.
  else if (CriticalResourceIdx != UINT_MAX) {
    for (TargetSchedModel::ProcResIter
           PI = SchedModel->getWriteProcResBegin(SC),
           PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI)
      if (PI->ProcResourceIdx == CriticalResourceIdx)
        Cost = PI->Cycles;
  }

  return Cost;
}

void SystemZHazardRecognizer::emitInstruction(MachineInstr *MI,
                                              bool TakenBranch) {
  // Make a temporary SUnit.
  SUnit SU(MI, 0);

  // Set interesting flags.
  SU.isCall = MI->isCall();

  const MCSchedClassDesc *SC = SchedModel->resolveSchedClass(MI);
  for (const MCWriteProcResEntry &PRE :
         make_range(SchedModel->getWriteProcResBegin(SC),
                    SchedModel->getWriteProcResEnd(SC))) {
    switch (SchedModel->getProcResource(PRE.ProcResourceIdx)->BufferSize) {
    case 0:
      SU.hasReservedResource = true;
      break;
    case 1:
      SU.isUnbuffered = true;
      break;
    default:
      break;
    }
  }

  unsigned GroupSizeBeforeEmit = CurrGroupSize;
  EmitInstruction(&SU);

  if (!TakenBranch && isBranchRetTrap(MI)) {
    // NT Branch on second slot ends group.
    if (GroupSizeBeforeEmit == 1)
      nextGroup();
  }

  if (TakenBranch && CurrGroupSize > 0)
    nextGroup();

  assert ((!MI->isTerminator() || isBranchRetTrap(MI)) &&
          "Scheduler: unhandled terminator!");
}

void SystemZHazardRecognizer::
copyState(SystemZHazardRecognizer *Incoming) {
  // Current decoder group
  CurrGroupSize = Incoming->CurrGroupSize;
  LLVM_DEBUG(CurGroupDbg = Incoming->CurGroupDbg;);

  // Processor resources
  ProcResourceCounters = Incoming->ProcResourceCounters;
  CriticalResourceIdx = Incoming->CriticalResourceIdx;

  // FPd
  LastFPdOpCycleIdx = Incoming->LastFPdOpCycleIdx;
  GrpCount = Incoming->GrpCount;
}