Skip to content

Commit

Permalink
[vm] Improve same-as-first constraint handling
Browse files Browse the repository at this point in the history
All other moves generated by the register allocator to satisfy
constraints were using `PrefersRegister` for the source, but
`SameAsFirstInput` was using `Any`. This lead to situations
where hot code inside a loop would repeatedly reload a constant.
To avoid these situations switch `SameAsFirstInput` to use
`PrefersRegister` when the use occurs inside a loop.

Additionally introduce an extension of `SameAsFirstInput`:
`SameAsFirstOrSecondInput`. This new constraint allows register
allocator to reorder first and second inputs if the second one
is no longer alive after the instruction. This allows register
allocator to avoid unnecessary move.

`SameAsFirstOrSecondInput` can be used as an output constraint
for commutative binary operations on X64

Additionally this CL adds a nascent infrastructure for writing
unit tests against register allocator. See `linearscan_test.cc`.

This CL improves code quality for tight loops with binary
double operations written with constants on the left, for example:


    loop {
      doubleA = C * doubleB
    }

Before this CL `C` would be reloaded into a register immediately
before multiplication, but after this CL it will be kept in
register (if register pressure allows).

Issue #56705

TEST=LinearScan_TestSameAsFirstOrSecond*

Change-Id: Id8e8242a8d1c1d1b8958076f10257e21e4a00aae
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/385001
Reviewed-by: Alexander Markov <[email protected]>
Commit-Queue: Slava Egorov <[email protected]>
  • Loading branch information
mraleph authored and Commit Queue committed Sep 20, 2024
1 parent f7e3aa2 commit 34fa3c1
Show file tree
Hide file tree
Showing 11 changed files with 328 additions and 23 deletions.
1 change: 1 addition & 0 deletions runtime/vm/compiler/backend/flow_graph_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1849,6 +1849,7 @@ void FlowGraphCompiler::AllocateRegistersLocally(Instruction* instr) {
Location::RegisterLocation(AllocateFreeRegister(blocked_registers));
break;
case Location::kSameAsFirstInput:
case Location::kSameAsFirstOrSecondInput:
result_location = locs->in(0);
break;
case Location::kRequiresFpuRegister:
Expand Down
27 changes: 9 additions & 18 deletions runtime/vm/compiler/backend/il.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2226,30 +2226,21 @@ Definition* BinaryDoubleOpInstr::Canonicalize(FlowGraph* flow_graph) {
return square;
}

if (left()->BindsToConstant() && !right()->BindsToConstant() &&
Token::IsCommutativeOp(op_kind())) {
Value* l = left();
Value* r = right();
SetInputAt(0, r);
SetInputAt(1, l);
}

return this;
}

Definition* DoubleTestOpInstr::Canonicalize(FlowGraph* flow_graph) {
return HasUses() ? this : nullptr;
}

static bool IsCommutative(Token::Kind op) {
switch (op) {
case Token::kMUL:
FALL_THROUGH;
case Token::kADD:
FALL_THROUGH;
case Token::kBIT_AND:
FALL_THROUGH;
case Token::kBIT_OR:
FALL_THROUGH;
case Token::kBIT_XOR:
return true;
default:
return false;
}
}

UnaryIntegerOpInstr* UnaryIntegerOpInstr::Make(Representation representation,
Token::Kind op_kind,
Value* value,
Expand Down Expand Up @@ -2430,7 +2421,7 @@ Definition* BinaryIntegerOpInstr::Canonicalize(FlowGraph* flow_graph) {
}

if (left()->BindsToConstant() && !right()->BindsToConstant() &&
IsCommutative(op_kind())) {
Token::IsCommutativeOp(op_kind())) {
Value* l = left();
Value* r = right();
SetInputAt(0, r);
Expand Down
1 change: 1 addition & 0 deletions runtime/vm/compiler/backend/il.h
Original file line number Diff line number Diff line change
Expand Up @@ -1733,6 +1733,7 @@ class BlockEntryInstr : public TemplateInstruction<0, NoThrow> {
bool InsideTryBlock() const { return try_index_ != kInvalidTryIndex; }

// Loop related methods.
bool IsInsideLoop() { return loop_info_ != nullptr; }
LoopInfo* loop_info() const { return loop_info_; }
void set_loop_info(LoopInfo* loop_info) { loop_info_ = loop_info; }
bool IsLoopHeader() const;
Expand Down
12 changes: 9 additions & 3 deletions runtime/vm/compiler/backend/il_x64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4090,7 +4090,9 @@ LocationSummary* BinaryDoubleOpInstr::MakeLocationSummary(Zone* zone,
LocationSummary(zone, kNumInputs, kNumTemps, LocationSummary::kNoCall);
summary->set_in(0, Location::RequiresFpuRegister());
summary->set_in(1, Location::RequiresFpuRegister());
summary->set_out(0, Location::SameAsFirstInput());
summary->set_out(0, Token::IsCommutativeOp(op_kind())
? Location::SameAsFirstOrSecondInput()
: Location::SameAsFirstInput());
return summary;
}

Expand Down Expand Up @@ -5934,7 +5936,9 @@ LocationSummary* BinaryInt64OpInstr::MakeLocationSummary(Zone* zone,
zone, kNumInputs, kNumTemps, LocationSummary::kNoCall);
summary->set_in(0, Location::RequiresRegister());
summary->set_in(1, LocationRegisterOrConstant(right()));
summary->set_out(0, Location::SameAsFirstInput());
summary->set_out(0, Token::IsCommutativeOp(op_kind())
? Location::SameAsFirstOrSecondInput()
: Location::SameAsFirstInput());
return summary;
}
}
Expand Down Expand Up @@ -6341,7 +6345,9 @@ LocationSummary* BinaryUint32OpInstr::MakeLocationSummary(Zone* zone,
LocationSummary(zone, kNumInputs, kNumTemps, LocationSummary::kNoCall);
summary->set_in(0, Location::RequiresRegister());
summary->set_in(1, Location::RequiresRegister());
summary->set_out(0, Location::SameAsFirstInput());
summary->set_out(0, Token::IsCommutativeOp(op_kind())
? Location::SameAsFirstOrSecondInput()
: Location::SameAsFirstInput());
return summary;
}

Expand Down
55 changes: 53 additions & 2 deletions runtime/vm/compiler/backend/linearscan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1338,8 +1338,11 @@ void FlowGraphAllocator::ProcessOneOutput(BlockEntryInstr* block,
in_ref->Equals(Location::RequiresFpuRegister()));
*out = *in_ref;
// Create move that will copy value between input and output.
MoveOperands* move =
AddMoveAt(pos, Location::RequiresRegister(), Location::Any());
// Inside loops prefer to allocate a register for the value for this
// move, but do not require it.
MoveOperands* move = AddMoveAt(
pos, Location::RequiresRegister(),
block->IsInsideLoop() ? Location::PrefersRegister() : Location::Any());

// Add uses to the live range of the input.
LiveRange* input_range = GetLiveRange(input_vreg);
Expand Down Expand Up @@ -1377,6 +1380,34 @@ void FlowGraphAllocator::ProcessOneOutput(BlockEntryInstr* block,
CompleteRange(range, def->RegisterKindForResult());
}

bool FlowGraphAllocator::IsDeadAfterCurrentInstruction(BlockEntryInstr* block,
Instruction* current,
Definition* defn) {
// Do not bother with pair representations for now.
if (defn->HasPairRepresentation()) {
return false;
}

auto range = GetLiveRange(defn->vreg(0));

// Register allocator is building live ranges by visiting blocks in
// postorder and iterating instructions within blocks backwards. When
// we start iterating the block for each value which is live out of the block
// we prepend a use interval covering the whole block to the live range of
// the block. This means all uses which we encounter are being monotonically
// prepended to the start of the range. See |BuildLiveRanges| and |DefineAt|
// for more details.
//
// In other words: it is only possible for a value to have a use *after* the
// current instruction if corresponding range is not empty and it starts
// with a use interval which starts within the current block. It might be
// either interval corresponding to the real use within the block or an
// artificial interval which spans the whole block created for the value
// which flows out of the block.
return range->first_use_interval() == nullptr ||
range->first_use_interval()->start() >= block->end_pos();
}

// Create and update live ranges corresponding to instruction's inputs,
// temporaries and output.
void FlowGraphAllocator::ProcessOneInstruction(BlockEntryInstr* block,
Expand Down Expand Up @@ -1436,6 +1467,26 @@ void FlowGraphAllocator::ProcessOneInstruction(BlockEntryInstr* block,
}
}

if (locs->out(0).IsUnallocated() &&
(locs->out(0).policy() == Location::kSameAsFirstOrSecondInput)) {
auto in_left = locs->in(0);
auto in_right = locs->in(1);
// Check if operation has the same constraint on both inputs.
if (in_left.Equals(in_right)) {
// If the first input outlives this instruction but the second does not,
// then we should flip them to reduce register pressure and avoid
// redundant move.
auto defn_left = current->InputAt(0)->definition();
auto defn_right = current->InputAt(1)->definition();
if (!IsDeadAfterCurrentInstruction(block, current, defn_left) &&
IsDeadAfterCurrentInstruction(block, current, defn_right)) {
current->InputAt(0)->BindTo(defn_right);
current->InputAt(1)->BindTo(defn_left);
}
}
locs->set_out(0, Location::SameAsFirstInput());
}

const bool output_same_as_first_input =
locs->out(0).IsUnallocated() &&
(locs->out(0).policy() == Location::kSameAsFirstInput);
Expand Down
7 changes: 7 additions & 0 deletions runtime/vm/compiler/backend/linearscan.h
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,13 @@ class FlowGraphAllocator : public ValueObject {
intptr_t pos,
Location::Kind kind);

// Returns true if |defn| is not used after |current|.
//
// Only works during range construction (e.g. ProcessOneInstruction).
bool IsDeadAfterCurrentInstruction(BlockEntryInstr* block,
Instruction* current,
Definition* defn);

void PrintLiveRanges();

// Assign locations for each outgoing argument. Outgoing argumenst are
Expand Down
Loading

0 comments on commit 34fa3c1

Please sign in to comment.