Skip to content

Commit 2e53a68

Browse files
authored
[flang][runtime] Speed up initialization & destruction (#148087)
Rework derived type initialization in the runtime to just initialize the first element of any array, and then memcpy it to the others, rather than exercising the per-component paths for each element. Rework derived type destruction in the runtime to detect and exploit a fast path for allocatable components whose types themselves don't need nested destruction. Small tweaks were made in hot paths exposed by profiling in descriptor operations and derived type assignment.
1 parent 18286e0 commit 2e53a68

File tree

8 files changed

+241
-157
lines changed

8 files changed

+241
-157
lines changed

flang-rt/include/flang-rt/runtime/descriptor.h

Lines changed: 80 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,16 @@
2020

2121
#include "memory.h"
2222
#include "type-code.h"
23+
#include "flang-rt/runtime/allocator-registry.h"
2324
#include "flang/Common/ISO_Fortran_binding_wrapper.h"
25+
#include "flang/Common/optional.h"
2426
#include "flang/Runtime/descriptor-consts.h"
2527
#include <algorithm>
2628
#include <cassert>
2729
#include <cinttypes>
2830
#include <cstddef>
2931
#include <cstdio>
32+
#include <cstdlib>
3033
#include <cstring>
3134

3235
/// Value used for asyncObject when no specific stream is specified.
@@ -262,9 +265,20 @@ class Descriptor {
262265

263266
template <typename A>
264267
RT_API_ATTRS A *ZeroBasedIndexedElement(std::size_t n) const {
265-
SubscriptValue at[maxRank];
266-
if (SubscriptsForZeroBasedElementNumber(at, n)) {
267-
return Element<A>(at);
268+
if (raw_.rank == 0) {
269+
if (n == 0) {
270+
return OffsetElement<A>();
271+
}
272+
} else if (raw_.rank == 1) {
273+
const auto &dim{GetDimension(0)};
274+
if (n < static_cast<std::size_t>(dim.Extent())) {
275+
return OffsetElement<A>(n * dim.ByteStride());
276+
}
277+
} else {
278+
SubscriptValue at[maxRank];
279+
if (SubscriptsForZeroBasedElementNumber(at, n)) {
280+
return Element<A>(at);
281+
}
268282
}
269283
return nullptr;
270284
}
@@ -366,6 +380,18 @@ class Descriptor {
366380
RT_API_ATTRS std::size_t SizeInBytes() const;
367381

368382
RT_API_ATTRS std::size_t Elements() const;
383+
// Header-inline element count: the product of the extents of all
// dimensions of this descriptor, or 1 when the rank is zero.
RT_API_ATTRS std::size_t InlineElements() const {
  std::size_t count{1};
  // An empty product (rank 0) leaves the count at 1, i.e., a scalar.
  for (int dim{0}, dims{rank()}; dim < dims; ++dim) {
    count *= static_cast<std::size_t>(GetDimension(dim).Extent());
  }
  return count;
}
369395

370396
// Allocate() assumes Elements() and ElementBytes() work;
371397
// define the extents of the dimensions and the element length
@@ -377,7 +403,22 @@ class Descriptor {
377403

378404
// Deallocates storage; does not call FINAL subroutines or
379405
// deallocate allocatable/automatic components.
380-
RT_API_ATTRS int Deallocate();
406+
// Releases the storage addressed by base_addr and nulls it out.
// Returns CFI_ERROR_BASE_ADDR_NULL when there is nothing to release,
// CFI_SUCCESS otherwise.
RT_API_ATTRS int Deallocate() {
  ISO::CFI_cdesc_t &descriptor{raw()};
  void *pointer{descriptor.base_addr};
  if (!pointer) {
    return CFI_ERROR_BASE_ADDR_NULL;
  }
  // Resolve the allocator index once and use it for both the fast-path
  // test and the registry lookup.
  const int allocIndex{MapAllocIdx()};
  if (allocIndex == kDefaultAllocator) {
    // Default allocator: skip the registry lookup and free directly.
    std::free(pointer);
  } else {
    allocatorRegistry.GetDeallocator(allocIndex)(pointer);
  }
  descriptor.base_addr = nullptr;
  return CFI_SUCCESS;
}
381422

382423
// Deallocates storage, including allocatable and automatic
383424
// components. Optionally invokes FINAL subroutines.
@@ -392,8 +433,7 @@ class Descriptor {
392433
bool stridesAreContiguous{true};
393434
for (int j{0}; j < leadingDimensions; ++j) {
394435
const Dimension &dim{GetDimension(j)};
395-
stridesAreContiguous &=
396-
(bytes == dim.ByteStride()) || (dim.Extent() == 1);
436+
stridesAreContiguous &= bytes == dim.ByteStride() || dim.Extent() == 1;
397437
bytes *= dim.Extent();
398438
}
399439
// One and zero element arrays are contiguous even if the descriptor
@@ -406,6 +446,32 @@ class Descriptor {
406446
return stridesAreContiguous || bytes == 0;
407447
}
408448

449+
// The result, if any, is a fixed stride value that can be used to
450+
// address all elements. It generalizes contiguity by also allowing
451+
// the case of an array with extent 1 on all but one dimension.
452+
RT_API_ATTRS common::optional<SubscriptValue> FixedStride() const {
453+
auto rank{static_cast<std::size_t>(raw_.rank)};
454+
common::optional<SubscriptValue> stride;
455+
for (std::size_t j{0}; j < rank; ++j) {
456+
const Dimension &dim{GetDimension(j)};
457+
auto extent{dim.Extent()};
458+
if (extent == 0) {
459+
break; // empty array
460+
} else if (extent == 1) { // ok
461+
} else if (stride) {
462+
// Extent > 1 on multiple dimensions
463+
if (IsContiguous()) {
464+
return ElementBytes();
465+
} else {
466+
return common::nullopt;
467+
}
468+
} else {
469+
stride = dim.ByteStride();
470+
}
471+
}
472+
return stride.value_or(0); // 0 for scalars and empty arrays
473+
}
474+
409475
// Establishes a pointer to a section or element.
410476
RT_API_ATTRS bool EstablishPointerSection(const Descriptor &source,
411477
const SubscriptValue *lower = nullptr,
@@ -427,6 +493,14 @@ class Descriptor {
427493
RT_API_ATTRS inline int GetAllocIdx() const {
428494
return (raw_.extra & _CFI_ALLOCATOR_IDX_MASK) >> _CFI_ALLOCATOR_IDX_SHIFT;
429495
}
496+
// Yields the allocator index to use for this descriptor: the index
// stored in the descriptor, except under RT_DEVICE_COMPILATION, where
// the default allocator is always selected.
RT_API_ATTRS int MapAllocIdx() const {
#ifndef RT_DEVICE_COMPILATION
  return GetAllocIdx();
#else
  // Force default allocator in device code.
  return kDefaultAllocator;
#endif
}
430504
RT_API_ATTRS inline void SetAllocIdx(int pos) {
431505
raw_.extra &= ~_CFI_ALLOCATOR_IDX_MASK; // Clear the allocator index bits.
432506
raw_.extra |= pos << _CFI_ALLOCATOR_IDX_SHIFT;

flang-rt/include/flang-rt/runtime/type-info.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,9 @@ class Component {
6868
RT_API_ATTRS std::uint64_t offset() const { return offset_; }
6969
RT_API_ATTRS const Value &characterLen() const { return characterLen_; }
7070
RT_API_ATTRS const DerivedType *derivedType() const {
71-
return derivedType_.descriptor().OffsetElement<const DerivedType>();
71+
return category() == TypeCategory::Derived
72+
? derivedType_.descriptor().OffsetElement<const DerivedType>()
73+
: nullptr;
7274
}
7375
RT_API_ATTRS const Value *lenValue() const {
7476
return lenValue_.descriptor().OffsetElement<const Value>();

flang-rt/include/flang-rt/runtime/work-queue.h

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
#include "flang-rt/runtime/stat.h"
6363
#include "flang-rt/runtime/type-info.h"
6464
#include "flang/Common/api-attrs.h"
65+
#include "flang/Common/optional.h"
6566
#include "flang/Runtime/freestanding-tools.h"
6667
#include <flang/Common/variant.h>
6768

@@ -124,7 +125,7 @@ class Elementwise {
124125

125126
protected:
126127
const Descriptor &instance_, *from_{nullptr};
127-
std::size_t elements_{instance_.Elements()};
128+
std::size_t elements_{instance_.InlineElements()};
128129
std::size_t elementAt_{0};
129130
SubscriptValue subscripts_[common::maxRank];
130131
SubscriptValue fromSubscripts_[common::maxRank];
@@ -133,11 +134,19 @@ class Elementwise {
133134
// Base class for ticket workers that operate over derived type components.
134135
class Componentwise {
135136
public:
136-
RT_API_ATTRS Componentwise(const typeInfo::DerivedType &);
137+
// Binds the worker to a derived type, records how many components it
// has, and positions the worker on the first component (if any).
RT_API_ATTRS Componentwise(const typeInfo::DerivedType &derived)
    : derived_{derived},
      components_{derived_.component().InlineElements()} {
  GetFirstComponent();
}
141+
137142
RT_API_ATTRS bool IsComplete() const { return componentAt_ >= components_; }
138143
RT_API_ATTRS void Advance() {
139144
++componentAt_;
140-
GetComponent();
145+
if (IsComplete()) {
146+
component_ = nullptr;
147+
} else {
148+
++component_;
149+
}
141150
}
142151
RT_API_ATTRS void SkipToEnd() {
143152
component_ = nullptr;
@@ -146,15 +155,21 @@ class Componentwise {
146155
RT_API_ATTRS void Reset() {
147156
component_ = nullptr;
148157
componentAt_ = 0;
149-
GetComponent();
158+
GetFirstComponent();
150159
}
151-
RT_API_ATTRS void GetComponent();
152160

153161
protected:
154162
const typeInfo::DerivedType &derived_;
155163
std::size_t components_{0}, componentAt_{0};
156164
const typeInfo::Component *component_{nullptr};
157165
StaticDescriptor<common::maxRank, true, 0> componentDescriptor_;
166+
167+
private:
168+
// Points component_ at the first entry of the derived type's component
// table; leaves it untouched when the type has no components.
RT_API_ATTRS void GetFirstComponent() {
  if (components_ == 0) {
    return;
  }
  component_ = derived_.component().OffsetElement<typeInfo::Component>();
}
158173
};
159174

160175
// Base class for ticket workers that operate over derived type components
@@ -230,14 +245,14 @@ class ElementsOverComponents : public Elementwise, public Componentwise {
230245

231246
// Ticket worker classes
232247

233-
// Implements derived type instance initialization
248+
// Implements derived type instance initialization.
234249
class InitializeTicket : public ImmediateTicketRunner<InitializeTicket>,
235-
private ComponentsOverElements {
250+
private ElementsOverComponents {
236251
public:
237252
RT_API_ATTRS InitializeTicket(
238253
const Descriptor &instance, const typeInfo::DerivedType &derived)
239254
: ImmediateTicketRunner<InitializeTicket>{*this},
240-
ComponentsOverElements{instance, derived} {}
255+
ElementsOverComponents{instance, derived} {}
241256
RT_API_ATTRS int Begin(WorkQueue &);
242257
RT_API_ATTRS int Continue(WorkQueue &);
243258
};
@@ -285,12 +300,14 @@ class DestroyTicket : public ImmediateTicketRunner<DestroyTicket>,
285300
RT_API_ATTRS DestroyTicket(const Descriptor &instance,
286301
const typeInfo::DerivedType &derived, bool finalize)
287302
: ImmediateTicketRunner<DestroyTicket>{*this},
288-
ComponentsOverElements{instance, derived}, finalize_{finalize} {}
303+
ComponentsOverElements{instance, derived}, finalize_{finalize},
304+
fixedStride_{instance.FixedStride()} {}
289305
RT_API_ATTRS int Begin(WorkQueue &);
290306
RT_API_ATTRS int Continue(WorkQueue &);
291307

292308
private:
293309
bool finalize_{false};
310+
std::optional<SubscriptValue> fixedStride_;
294311
};
295312

296313
// Implements general intrinsic assignment
@@ -304,11 +321,11 @@ class AssignTicket : public ImmediateTicketRunner<AssignTicket> {
304321
RT_API_ATTRS int Continue(WorkQueue &);
305322

306323
private:
324+
RT_API_ATTRS Descriptor &GetTempDescriptor();
307325
// True when the assignment has no derived type on the left-hand side,
// both sides have the same rank and are contiguous, and their element
// sizes match.
RT_API_ATTRS bool IsSimpleMemmove() const {
  if (toDerived_ || to_.rank() != from_->rank()) {
    return false;
  }
  if (!to_.IsContiguous() || !from_->IsContiguous()) {
    return false;
  }
  return to_.ElementBytes() == from_->ElementBytes();
}
311-
RT_API_ATTRS Descriptor &GetTempDescriptor();
312329

313330
Descriptor &to_;
314331
const Descriptor *from_{nullptr};
@@ -552,6 +569,7 @@ class WorkQueue {
552569
TicketList *first_{nullptr}, *last_{nullptr}, *insertAfter_{nullptr};
553570
TicketList static_[numStatic_];
554571
TicketList *firstFree_{static_};
572+
bool anyDynamicAllocation_{false};
555573
};
556574

557575
RT_OFFLOAD_API_GROUP_END

flang-rt/lib/runtime/assign.cpp

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,8 @@ static RT_API_ATTRS void DoElementalDefinedAssignment(const Descriptor &to,
217217
toElementDesc.Establish(derived, nullptr, 0, nullptr, CFI_attribute_pointer);
218218
fromElementDesc.Establish(
219219
derived, nullptr, 0, nullptr, CFI_attribute_pointer);
220-
for (std::size_t toElements{to.Elements()}; toElements-- > 0;
221-
to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
220+
for (std::size_t toElements{to.InlineElements()}; toElements-- > 0;
221+
to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
222222
toElementDesc.set_base_addr(to.Element<char>(toAt));
223223
fromElementDesc.set_base_addr(from.Element<char>(fromAt));
224224
DoScalarDefinedAssignment(toElementDesc, fromElementDesc, derived, special);
@@ -431,11 +431,14 @@ RT_API_ATTRS int AssignTicket::Continue(WorkQueue &workQueue) {
431431
}
432432
}
433433
// Intrinsic assignment
434-
std::size_t toElements{to_.Elements()};
435-
if (from_->rank() > 0 && toElements != from_->Elements()) {
436-
workQueue.terminator().Crash("Assign: mismatching element counts in array "
437-
"assignment (to %zd, from %zd)",
438-
toElements, from_->Elements());
434+
std::size_t toElements{to_.InlineElements()};
435+
if (from_->rank() > 0) {
436+
std::size_t fromElements{from_->InlineElements()};
437+
if (toElements != fromElements) {
438+
workQueue.terminator().Crash("Assign: mismatching element counts in "
439+
"array assignment (to %zd, from %zd)",
440+
toElements, fromElements);
441+
}
439442
}
440443
if (to_.type() != from_->type()) {
441444
workQueue.terminator().Crash(
@@ -529,7 +532,7 @@ RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Begin(
529532
// allocatable components or defined ASSIGNMENT(=) at any level.
530533
memmoveFct_(this->instance_.template OffsetElement<char>(),
531534
this->from_->template OffsetElement<const char *>(),
532-
this->instance_.Elements() * elementBytes);
535+
this->instance_.InlineElements() * elementBytes);
533536
return StatOk;
534537
}
535538
}
@@ -544,7 +547,7 @@ RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Begin(
544547
// Copy procedure pointer components
545548
const Descriptor &procPtrDesc{this->derived_.procPtr()};
546549
bool noDataComponents{this->IsComplete()};
547-
if (std::size_t numProcPtrs{procPtrDesc.Elements()}) {
550+
if (std::size_t numProcPtrs{procPtrDesc.InlineElements()}) {
548551
for (std::size_t k{0}; k < numProcPtrs; ++k) {
549552
const auto &procPtr{
550553
*procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
@@ -615,7 +618,7 @@ RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Continue(
615618
memmoveFct_(to, from, componentByteSize);
616619
}
617620
}
618-
this->Componentwise::Advance();
621+
this->SkipToNextComponent();
619622
} else {
620623
memmoveFct_(
621624
this->instance_.template Element<char>(this->subscripts_) +
@@ -648,7 +651,7 @@ RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Continue(
648651
memmoveFct_(to, from, componentByteSize);
649652
}
650653
}
651-
this->Componentwise::Advance();
654+
this->SkipToNextComponent();
652655
} else {
653656
memmoveFct_(this->instance_.template Element<char>(this->subscripts_) +
654657
this->component_->offset(),
@@ -670,11 +673,11 @@ RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Continue(
670673
if (toDesc->IsAllocatable() && !fromDesc->IsAllocated()) {
671674
if (toDesc->IsAllocated()) {
672675
if (this->phase_ == 0) {
673-
this->phase_++;
674676
if (componentDerived && !componentDerived->noDestructionNeeded()) {
675677
if (int status{workQueue.BeginDestroy(
676678
*toDesc, *componentDerived, /*finalize=*/false)};
677679
status != StatOk) {
680+
this->phase_++;
678681
return status;
679682
}
680683
}
@@ -727,15 +730,15 @@ RT_API_ATTRS void DoFromSourceAssign(Descriptor &alloc,
727730
SubscriptValue allocAt[maxRank];
728731
alloc.GetLowerBounds(allocAt);
729732
if (allocDerived) {
730-
for (std::size_t n{alloc.Elements()}; n-- > 0;
731-
alloc.IncrementSubscripts(allocAt)) {
733+
for (std::size_t n{alloc.InlineElements()}; n-- > 0;
734+
alloc.IncrementSubscripts(allocAt)) {
732735
Descriptor allocElement{*Descriptor::Create(*allocDerived,
733736
reinterpret_cast<void *>(alloc.Element<char>(allocAt)), 0)};
734737
Assign(allocElement, source, terminator, NoAssignFlags, memmoveFct);
735738
}
736739
} else { // intrinsic type
737-
for (std::size_t n{alloc.Elements()}; n-- > 0;
738-
alloc.IncrementSubscripts(allocAt)) {
740+
for (std::size_t n{alloc.InlineElements()}; n-- > 0;
741+
alloc.IncrementSubscripts(allocAt)) {
739742
memmoveFct(alloc.Element<char>(allocAt), source.raw().base_addr,
740743
alloc.ElementBytes());
741744
}

0 commit comments

Comments
 (0)