RRVM 编译器优化-指令调度

基于模拟硬件流水线的指令调度

Featured image

instruction scheduling

注:这是在当时编译器比赛(全国大学生计算机系统能力大赛-编译系统设计赛)实现这个优化 pass 之前写的设计思路文档。big credit to Fuyuki
指令调度大家肯定都很熟悉,我们这里把这个排序过程转成一个优化问题,按照软流水/硬件流水线/降低寄存器压力(优先发射生命周期更早终止的寄存器)等指标设置估值函数,从而用一个保存多状态的动态规划算法解决(viterbi 算法)
项目仓库,主要与其中 commit c42b63f 有关,在比赛过程中,设置惩罚函数为纯硬件流水线

建数据依赖图

对于块中的指令,从后往前遍历,维护以下依赖关系:

惩罚函数

register punishment

首先可以注意一下,不会出现 live_in->use->def->use 的情况,所以 use 一定是在 live_in 或者 def 后面

维护的数据结构

对于每个在这个基本块中出现过的寄存器,维护以下指标:

pipeline punishment

因为现有 punishment 会改变指令顺序,可能降低程序并行能力,反而会导致部分测例上的性能损失,比较 emo
我们根据 software pipelining,对指令顺序进行调整,从而讨好处理器

处理器情况

进而推测我们处理器型号

#include "scheduling.hpp"
#include <queue>
// Virtual operand that represents the condition (flags) register, so that
// flag-setting and flag-reading instructions get ordinary def/use edges.
const MachineOperand COND = MachineOperand{MachineOperand::State::PreColored, 0x40000000};

std::pair<std::vector<MachineOperand>, std::vector<MachineOperand>> get_def_use_scheduling(MachineInst *inst); // returns (defs, uses); the usual def/use computation

// Classes of functional units (issue pipes) on the modeled core.
enum class CortexA72FUKind { Branch, Integer, IntegerMultiple, Load, Store };

// reference: Cortex-A72 software optimization guide
std::pair<u32, CortexA72FUKind> get_info(MachineInst *inst); // returns (latency, functional-unit kind)

// A node of the per-block data-dependence graph: one machine instruction
// plus the bookkeeping the list scheduler needs.
struct Node {
  MachineInst *inst;
  // critical-path length (cycles) from this node to the end of the block
  u32 priority = 0;
  // issue-to-result latency of the instruction
  u32 latency;
  // functional-unit class that can execute the instruction
  CortexA72FUKind kind;
  // scratch counter: remaining out-degree while computing priorities,
  // remaining in-degree while scheduling (was left uninitialized before)
  u32 temp = 0;
  std::set<Node *> out_edges; // nodes that must issue after this one
  std::set<Node *> in_edges;  // nodes this one depends on

  explicit Node(MachineInst *inst) : inst(inst) {
    auto [l, k] = get_info(inst);
    latency = l;
    kind = k;
  }
};

// One functional unit (issue pipe) of the modeled Cortex-A72 core.
// A unit holds at most one instruction at a time; it frees up once the
// scheduler's cycle counter reaches `complete_cycle`.
struct CortexA72FU {
  CortexA72FUKind kind;   // instruction class this pipe accepts
  Node *inflight = nullptr;   // instruction currently executing, if any
  u32 complete_cycle = 0;   // cycle at which `inflight` finishes
};

struct NodeCompare {
  bool operator()(Node *const &lhs, const Node *const &rhs) const {
    if (lhs->priority != rhs->priority) return lhs->priority > rhs->priority;
    if (lhs->latency != rhs->latency) return lhs->latency > rhs->latency;
    return false;
  }
};

// List scheduling over a simulated Cortex-A72 pipeline.
//
// Per basic block:
//  1. build the data-dependence graph of the block's instructions
//     (RAW/WAR/WAW register edges, ordering constraints around
//     stores/calls/loads, and "everything before the terminator" edges);
//  2. compute each node's priority = critical-path length, in cycles,
//     from the node to the end of the block;
//  3. rebuild the instruction list by greedily issuing ready nodes,
//     highest priority first, onto the modeled functional units.
//
// The transformation is in place: bb->insts is cleared and refilled.
// Fixes vs the previous version: the Node graph is freed (it leaked),
// and unit completion uses `<=` so a zero-latency instruction cannot
// wedge the scheduling loop.
void instruction_schedule(MachineFunc *f) {
  for (auto bb = f->bb.head; bb; bb = bb->next) {
    // --- data-dependence graph construction ---
    // instructions that have read this register since its last write
    std::map<u32, std::vector<Node *>> read_insts;
    // last instruction that wrote this register
    std::map<u32, Node *> write_insts;
    // loads can be reordered, but not across stores and calls;
    // last instruction with a side effect (store, call)
    Node *side_effect = nullptr;
    std::vector<Node *> load_insts;
    // owns every node built for this block; released at loop end
    std::vector<Node *> nodes;

    for (auto inst = bb->insts.head; inst; inst = inst->next) {
      if (isa<MIComment>(inst)) {
        continue;
      }
      auto [def, use] = get_def_use_scheduling(inst);
      auto node = new Node(inst);
      nodes.push_back(node);
      for (auto &u : use) {
        if (u.is_reg()) {
          // read-after-write edge
          // (operator[] default-inserts nullptr, which the `if` skips)
          if (auto &w = write_insts[u.value]) {
            w->out_edges.insert(node);
            node->in_edges.insert(w);
          }
        }
      }

      for (auto &d : def) {
        if (d.is_reg()) {
          // write-after-read edges
          for (auto &r : read_insts[d.value]) {
            r->out_edges.insert(node);
            node->in_edges.insert(r);
          }
          // write-after-write edge
          if (auto &w = write_insts[d.value]) {
            w->out_edges.insert(node);
            node->in_edges.insert(w);
          }
        }
      }

      for (auto &u : use) {
        if (u.is_reg()) {
          // record this instruction as a reader
          read_insts[u.value].push_back(node);
        }
      }

      for (auto &d : def) {
        if (d.is_reg()) {
          // this write kills earlier reads and writes of the register
          read_insts[d.value].clear();
          write_insts[d.value] = node;
        }
      }

      // keep side-effecting instructions in their original relative order
      // TODO: add alias information so independent memory ops can reorder
      if (isa<MIStore>(inst) || isa<MICall>(inst)) {
        if (side_effect) {
          side_effect->out_edges.insert(node);
          node->in_edges.insert(side_effect);
        }
        // a store/call may not move before any earlier load
        for (auto &n : load_insts) {
          n->out_edges.insert(node);
          node->in_edges.insert(n);
        }
        load_insts.clear();
      } else if (isa<MILoad>(inst)) {
        // a load may not move before an earlier store/call
        if (side_effect) {
          side_effect->out_edges.insert(node);
          node->in_edges.insert(side_effect);
        }
        load_insts.push_back(node);
      }

      if (isa<MIStore>(inst) || isa<MICall>(inst)) {
        side_effect = node;
      }
      // the terminator must stay at the end of the bb: depend on everything
      if (isa<MIBranch>(inst) || isa<MIJump>(inst) || isa<MIReturn>(inst)) {
        for (auto &n : nodes) {
          if (n != node) {
            n->out_edges.insert(node);
            node->in_edges.insert(n);
          }
        }
      }
    }

    // --- priority computation, walking the DAG backwards ---
    // temp is the remaining out-degree in this part
    std::vector<Node *> vis;
    for (auto &n : nodes) {
      n->temp = n->out_edges.size();
      if (n->out_edges.empty()) {
        vis.push_back(n);
        n->priority = n->latency;
      }
    }
    while (!vis.empty()) {
      Node *n = vis.back();
      vis.pop_back();
      for (auto &t : n->in_edges) {
        // longest path from t to the end of the block
        t->priority = std::max(t->priority, t->latency + n->priority);
        t->temp--;
        if (t->temp == 0) {
          vis.push_back(t);
        }
      }
    }

    // --- cycle-accurate greedy issue ---
    // functional units, see the Cortex-A72 software optimization guide
    CortexA72FU units[] = {
        {CortexA72FUKind::Branch},          {CortexA72FUKind::Integer}, {CortexA72FUKind::Integer},
        {CortexA72FUKind::IntegerMultiple}, {CortexA72FUKind::Load},    {CortexA72FUKind::Store},
    };
    u32 num_inflight = 0;

    // rebuild the instruction list from scratch
    // NOTE(review): control_transfer_inst is cleared but never re-set here;
    // presumably recomputed elsewhere — confirm against callers
    bb->control_transfer_inst = nullptr;
    bb->insts.head = bb->insts.tail = nullptr;
    // nodes whose dependencies are all satisfied
    std::vector<Node *> ready;
    // temp is the remaining in-degree in this part
    for (auto &n : nodes) {
      n->temp = n->in_edges.size();
      if (n->in_edges.empty()) {
        ready.push_back(n);
      }
    }

    u32 cycle = 0;
    while (!ready.empty() || num_inflight > 0) {
      std::sort(ready.begin(), ready.end(), NodeCompare{});
      for (u32 i = 0; i < ready.size();) {
        auto inst = ready[i];
        auto kind = inst->kind;
        bool fired = false;
        for (auto &f : units) {
          if (f.kind == kind && f.inflight == nullptr) {
            // issue on this unit
            bb->insts.insertAtEnd(inst->inst);
            num_inflight++;
            f.inflight = inst;
            f.complete_cycle = cycle + inst->latency;
            ready.erase(ready.begin() + i);
            fired = true;
            break;
          }
        }

        if (!fired) {
          i++; // no free unit of the right kind; retry next cycle
        }
      }

      cycle++;
      for (auto &unit : units) {
        // `<=` (not `==`) so a zero-latency instruction cannot complete
        // "in the past" and stay in flight forever
        if (unit.inflight && unit.complete_cycle <= cycle) {
          // retire: successors whose dependencies are all met become ready
          for (auto &t : unit.inflight->out_edges) {
            t->temp--;
            if (t->temp == 0) {
              ready.push_back(t);
            }
          }
          unit.inflight = nullptr;
          num_inflight--;
        }
      }
    }

    // all nodes have been scheduled; free the graph (fixes a memory leak)
    for (auto *n : nodes) {
      delete n;
    }
  }
}

rrvm-算法

todo: 如果是小基本块的话,思考能不能用 A* 算法