class generator_impl
Declaration
class generator_impl { /* full declaration omitted */ };
Member Variables
- instruction_graph* m_idag
- const task_manager* m_tm
- size_t m_num_nodes
- node_id m_local_nid
- system_info m_system
- instruction_graph_generator::delegate* m_delegate
- instruction_recorder* m_recorder
- instruction_graph_generator::policy_set m_policy
- instruction_id m_next_instruction_id = 0
- message_id m_next_message_id = 0
- instruction* m_last_horizon = nullptr
- instruction* m_last_epoch = nullptr
- std::unordered_set<instruction_id> m_execution_front
- The set of all instructions that are not yet depended upon by other instructions. These are collected by collapse_execution_front_to() as part of horizon / epoch generation.
- dense_map<memory_id, memory_state> m_memories
- std::unordered_map<buffer_id, buffer_state> m_buffers
- std::unordered_map<host_object_id, host_object_state> m_host_objects
- std::unordered_map<collective_group_id, collective_group_state> m_collective_groups
- std::vector<allocation_id> m_unreferenced_user_allocations
- The instruction executor maintains a mapping of allocation_id -> USM pointer. For IDAG-managed memory, these entries are deleted after executing a `free_instruction`, but since user allocations are not deallocated by us, we notify the executor on each horizon or epoch via the `instruction_garbage` struct about entries that will no longer be used and can therefore be collected. We include user allocations for buffer fences immediately after emitting the fence, and buffer host-initialization user allocations after the buffer has been destroyed.
- static const box<3> scalar_reduction_box = {zeros, ones}
Member Function Overview
- compile(const abstract_command & cmd) → void
- generator_impl(const task_manager & tm, size_t num_nodes, node_id local_nid, const system_info & system, instruction_graph & idag, instruction_graph_generator::delegate * dlg, instruction_recorder * recorder, const instruction_graph_generator::policy_set & policy)
- notify_buffer_created(buffer_id bid, const range<3> & range, size_t elem_size, size_t elem_align, allocation_id user_aid = null_allocation_id) → void
- notify_buffer_debug_name_changed(buffer_id bid, const std::string & name) → void
- notify_buffer_destroyed(buffer_id bid) → void
- notify_host_object_created(host_object_id hoid, bool owns_instance) → void
- notify_host_object_destroyed(host_object_id hoid) → void
- add_dependencies_on_last_concurrent_accesses(instruction * accessing_instruction, buffer_allocation_state & allocation, const region<3> & region, instruction_dependency_origin origin_for_read_write_front) → void
- add_dependencies_on_last_writers(instruction * accessing_instruction, buffer_allocation_state & allocation, const region<3> & region) → void
- add_dependency(instruction * from, instruction * to, instruction_dependency_origin record_origin) → void
- allocate_contiguously(batch & batch, buffer_id bid, memory_id mid, box_vector<3> && required_contiguous_boxes) → void
- apply_epoch(instruction * epoch) → void
- collapse_execution_front_to(instruction * horizon_or_epoch) → void
- commit_pending_region_receive_to_host_memory(batch & batch, buffer_id bid, const buffer_state::region_receive & receives, const std::vector<region<3>> & concurrent_reads) → void
- compile_epoch_command(batch & batch, const epoch_command & ecmd) → void
- compile_execution_command(batch & batch, const execution_command & ecmd) → void
- compile_fence_command(batch & batch, const fence_command & fcmd) → void
- compile_horizon_command(batch & batch, const horizon_command & hcmd) → void
- compile_push_command(batch & batch, const push_command & pcmd) → void
- compile_reduction_command(batch & batch, const reduction_command & rcmd) → void
- template <typename Instruction, typename... CtorParamsAndRecordWithFn> create(batch & batch, CtorParamsAndRecordWithFn &&... ctor_args_and_record_with) → Instruction *
- template <typename Instruction, typename... CtorParamsAndRecordWithFn, size_t... CtorParamIndices, size_t RecordWithFnIndex> create_internal(batch & batch, const std::tuple<CtorParamsAndRecordWithFn...> & ctor_args_and_record_with, std::index_sequence<CtorParamIndices...>, std::index_sequence<RecordWithFnIndex>) → Instruction *
- create_outbound_pilot(batch & batch, node_id target, const transfer_id & trid, const box<3> & box) → message_id
- create_task_collective_groups(batch & command_batch, const task & tsk) → void
- defer_await_push_command(const await_push_command & apcmd) → void
- establish_coherence_between_buffer_memories(batch & batch, buffer_id bid, memory_id dest_mid, const std::vector<region<3>> & concurrent_reads) → void
- finish_task_local_reduction(batch & command_batch, const local_reduction & red, const reduction_info & rinfo, const execution_command & ecmd, const task & tsk, const std::vector<localized_chunk> & concurrent_chunks) → void
- flush_batch(batch && batch) → void
- is_recording() const → bool
- launch_task_kernel(batch & command_batch, const execution_command & ecmd, const task & tsk, const localized_chunk & chunk) → instruction *
- new_allocation_id(memory_id mid) → allocation_id
- perform_atomic_write_to_allocation(instruction * writing_instruction, buffer_allocation_state & allocation, const region<3> & region) → void
- perform_concurrent_read_from_allocation(instruction * reading_instruction, buffer_allocation_state & allocation, const region<3> & region) → void
- perform_task_buffer_accesses(const task & tsk, const std::vector<localized_chunk> & concurrent_chunks, const std::vector<instruction *> & command_instructions) → void
- perform_task_collective_operations(const task & tsk, const std::vector<localized_chunk> & concurrent_chunks, const std::vector<instruction *> & command_instructions) → void
- perform_task_side_effects(const task & tsk, const std::vector<localized_chunk> & concurrent_chunks, const std::vector<instruction *> & command_instructions) → void
- prepare_task_local_reduction(batch & command_batch, const reduction_info & rinfo, const execution_command & ecmd, const task & tsk, size_t num_concurrent_chunks) → local_reduction
- print_buffer_debug_label(buffer_id bid) const → std::string
- report_task_overlapping_writes(const task & tsk, const std::vector<localized_chunk> & concurrent_chunks) const → void
- satisfy_task_buffer_requirements(batch & batch, buffer_id bid, const task & tsk, const subrange<3> & local_execution_range, bool is_reduction_initializer, const std::vector<localized_chunk> & concurrent_chunks_after_split) → void
- split_task_execution_range(const execution_command & ecmd, const task & tsk) → std::vector<localized_chunk>
Member Functions
¶void compile(const abstract_command& cmd)
void compile(const abstract_command& cmd)
Parameters
- const abstract_command& cmd
¶generator_impl(
const task_manager& tm,
size_t num_nodes,
node_id local_nid,
const system_info& system,
instruction_graph& idag,
instruction_graph_generator::delegate* dlg,
instruction_recorder* recorder,
const instruction_graph_generator::policy_set&
policy)
generator_impl(
const task_manager& tm,
size_t num_nodes,
node_id local_nid,
const system_info& system,
instruction_graph& idag,
instruction_graph_generator::delegate* dlg,
instruction_recorder* recorder,
const instruction_graph_generator::policy_set&
policy)
Parameters
- const task_manager& tm
- size_t num_nodes
- node_id local_nid
- const system_info& system
- instruction_graph& idag
- instruction_graph_generator::delegate* dlg
- instruction_recorder* recorder
- const instruction_graph_generator::policy_set& policy
¶void notify_buffer_created(
buffer_id bid,
const range<3>& range,
size_t elem_size,
size_t elem_align,
allocation_id user_aid = null_allocation_id)
void notify_buffer_created(
buffer_id bid,
const range<3>& range,
size_t elem_size,
size_t elem_align,
allocation_id user_aid = null_allocation_id)
Parameters
- buffer_id bid
- const range<3>& range
- size_t elem_size
- size_t elem_align
- allocation_id user_aid = null_allocation_id
¶void notify_buffer_debug_name_changed(
buffer_id bid,
const std::string& name)
void notify_buffer_debug_name_changed(
buffer_id bid,
const std::string& name)
Parameters
- buffer_id bid
- const std::string& name
¶void notify_buffer_destroyed(buffer_id bid)
void notify_buffer_destroyed(buffer_id bid)
Parameters
- buffer_id bid
¶void notify_host_object_created(
host_object_id hoid,
bool owns_instance)
void notify_host_object_created(
host_object_id hoid,
bool owns_instance)
Parameters
- host_object_id hoid
- bool owns_instance
¶void notify_host_object_destroyed(
host_object_id hoid)
void notify_host_object_destroyed(
host_object_id hoid)
Parameters
- host_object_id hoid
¶void add_dependencies_on_last_concurrent_accesses(
instruction* accessing_instruction,
buffer_allocation_state& allocation,
const region<3>& region,
instruction_dependency_origin
origin_for_read_write_front)
void add_dependencies_on_last_concurrent_accesses(
instruction* accessing_instruction,
buffer_allocation_state& allocation,
const region<3>& region,
instruction_dependency_origin
origin_for_read_write_front)
Parameters
- instruction* accessing_instruction
- buffer_allocation_state& allocation
- const region<3>& region
- instruction_dependency_origin origin_for_read_write_front
¶void add_dependencies_on_last_writers(
instruction* accessing_instruction,
buffer_allocation_state& allocation,
const region<3>& region)
void add_dependencies_on_last_writers(
instruction* accessing_instruction,
buffer_allocation_state& allocation,
const region<3>& region)
Parameters
- instruction* accessing_instruction
- buffer_allocation_state& allocation
- const region<3>& region
¶void add_dependency(
instruction* from,
instruction* to,
instruction_dependency_origin record_origin)
void add_dependency(
instruction* from,
instruction* to,
instruction_dependency_origin record_origin)
Description
Inserts a graph dependency and removes `to` from the execution front (if present). The `record_origin` is debug information.
Parameters
- instruction* from
- instruction* to
- instruction_dependency_origin record_origin
¶void allocate_contiguously(
batch& batch,
buffer_id bid,
memory_id mid,
box_vector<3>&& required_contiguous_boxes)
void allocate_contiguously(
batch& batch,
buffer_id bid,
memory_id mid,
box_vector<3>&& required_contiguous_boxes)
Description
Ensure that all boxes in `required_contiguous_boxes` have a contiguous allocation on `mid`. Since re-allocation of one buffer on one memory never interacts with other buffers or other memories backing the same buffer, this function can be called in any order of allocation requirements without generating additional dependencies.
Parameters
- batch& batch
- buffer_id bid
- memory_id mid
- box_vector<3>&& required_contiguous_boxes
¶void apply_epoch(instruction* epoch)
void apply_epoch(instruction* epoch)
Description
Replace all tracked instructions that are older than `epoch` with `epoch`.
Parameters
- instruction* epoch
¶void collapse_execution_front_to(
instruction* horizon_or_epoch)
void collapse_execution_front_to(
instruction* horizon_or_epoch)
Description
Add dependencies from horizon_or_epoch
to all instructions in m_execution_front
and clear the set.
Parameters
- instruction* horizon_or_epoch
¶void commit_pending_region_receive_to_host_memory(
batch& batch,
buffer_id bid,
const buffer_state::region_receive& receives,
const std::vector<region<3>>&
concurrent_reads)
void commit_pending_region_receive_to_host_memory(
batch& batch,
buffer_id bid,
const buffer_state::region_receive& receives,
const std::vector<region<3>>&
concurrent_reads)
Description
Insert one or more receive instructions in order to fulfil a pending receive, making the received data available in host_memory_id. This may entail receiving a region that is larger than the union of all regions read.
Parameters
- batch& batch
- buffer_id bid
- const buffer_state::region_receive& receives
- const std::vector<region<3>>& concurrent_reads
¶void compile_epoch_command(
batch& batch,
const epoch_command& ecmd)
void compile_epoch_command(
batch& batch,
const epoch_command& ecmd)
Parameters
- batch& batch
- const epoch_command& ecmd
¶void compile_execution_command(
batch& batch,
const execution_command& ecmd)
void compile_execution_command(
batch& batch,
const execution_command& ecmd)
Parameters
- batch& batch
- const execution_command& ecmd
¶void compile_fence_command(
batch& batch,
const fence_command& fcmd)
void compile_fence_command(
batch& batch,
const fence_command& fcmd)
Parameters
- batch& batch
- const fence_command& fcmd
¶void compile_horizon_command(
batch& batch,
const horizon_command& hcmd)
void compile_horizon_command(
batch& batch,
const horizon_command& hcmd)
Parameters
- batch& batch
- const horizon_command& hcmd
¶void compile_push_command(
batch& batch,
const push_command& pcmd)
void compile_push_command(
batch& batch,
const push_command& pcmd)
Parameters
- batch& batch
- const push_command& pcmd
¶void compile_reduction_command(
batch& batch,
const reduction_command& rcmd)
void compile_reduction_command(
batch& batch,
const reduction_command& rcmd)
Parameters
- batch& batch
- const reduction_command& rcmd
¶template <typename Instruction,
typename... CtorParamsAndRecordWithFn>
Instruction* create(
batch& batch,
CtorParamsAndRecordWithFn&&... ctor_args_and_record_with)
template <typename Instruction,
typename... CtorParamsAndRecordWithFn>
Instruction* create(
batch& batch,
CtorParamsAndRecordWithFn&&... ctor_args_and_record_with)
Description
Create an instruction, insert it into the IDAG and the current execution front, and record it if a recorder is present. Invoke as `create<instruction-type>(instruction-ctor-params..., [&](const auto record_debug_info) { return record_debug_info(instruction-record-additional-ctor-params)})`
Template Parameters
- Instruction
- CtorParamsAndRecordWithFn
Parameters
- batch& batch
- CtorParamsAndRecordWithFn&&... ctor_args_and_record_with
¶template <typename Instruction,
typename... CtorParamsAndRecordWithFn,
size_t... CtorParamIndices,
size_t RecordWithFnIndex>
Instruction* create_internal(
batch& batch,
const std::tuple<
CtorParamsAndRecordWithFn...>&
ctor_args_and_record_with,
std::index_sequence<CtorParamIndices...>,
std::index_sequence<RecordWithFnIndex>)
template <typename Instruction,
typename... CtorParamsAndRecordWithFn,
size_t... CtorParamIndices,
size_t RecordWithFnIndex>
Instruction* create_internal(
batch& batch,
const std::tuple<
CtorParamsAndRecordWithFn...>&
ctor_args_and_record_with,
std::index_sequence<CtorParamIndices...>,
std::index_sequence<RecordWithFnIndex>)
Template Parameters
- Instruction
- CtorParamsAndRecordWithFn
- size_t CtorParamIndices
- size_t RecordWithFnIndex
Parameters
- batch& batch
- const std::tuple<CtorParamsAndRecordWithFn...>& ctor_args_and_record_with
- std::index_sequence<CtorParamIndices...>
- std::index_sequence<RecordWithFnIndex>
¶message_id create_outbound_pilot(
batch& batch,
node_id target,
const transfer_id& trid,
const box<3>& box)
message_id create_outbound_pilot(
batch& batch,
node_id target,
const transfer_id& trid,
const box<3>& box)
Parameters
- batch& batch
- node_id target
- const transfer_id& trid
- const box<3>& box
¶void create_task_collective_groups(
batch& command_batch,
const task& tsk)
void create_task_collective_groups(
batch& command_batch,
const task& tsk)
Description
Issue instructions to create any collective group required by a task.
Parameters
- batch& command_batch
- const task& tsk
¶void defer_await_push_command(
const await_push_command& apcmd)
void defer_await_push_command(
const await_push_command& apcmd)
Parameters
- const await_push_command& apcmd
¶void establish_coherence_between_buffer_memories(
batch& batch,
buffer_id bid,
memory_id dest_mid,
const std::vector<region<3>>&
concurrent_reads)
void establish_coherence_between_buffer_memories(
batch& batch,
buffer_id bid,
memory_id dest_mid,
const std::vector<region<3>>&
concurrent_reads)
Description
Insert coherence copy instructions where necessary to make `dest_mid` coherent for all `concurrent_reads`. Requires the necessary allocations in `dest_mid` to already be present. We deliberately allow overlapping read-regions to avoid aggregated copies introducing synchronization points between otherwise independent instructions.
Parameters
- batch& batch
- buffer_id bid
- memory_id dest_mid
- const std::vector<region<3>>& concurrent_reads
¶void finish_task_local_reduction(
batch& command_batch,
const local_reduction& red,
const reduction_info& rinfo,
const execution_command& ecmd,
const task& tsk,
const std::vector<localized_chunk>&
concurrent_chunks)
void finish_task_local_reduction(
batch& command_batch,
const local_reduction& red,
const reduction_info& rinfo,
const execution_command& ecmd,
const task& tsk,
const std::vector<localized_chunk>&
concurrent_chunks)
Description
Combine any partial reduction results computed by local chunks and write the combined result to buffer host memory.
Parameters
- batch& command_batch
- const local_reduction& red
- const reduction_info& rinfo
- const execution_command& ecmd
- const task& tsk
- const std::vector<localized_chunk>& concurrent_chunks
¶void flush_batch(batch&& batch)
void flush_batch(batch&& batch)
Description
Passes all instructions and outbound pilots that have been accumulated in batch
to the delegate (if any). Called after compiling a command, creating or destroying a buffer or host object, and also in our constructor for the creation of the initial epoch.
Parameters
- batch&& batch
¶bool is_recording() const
bool is_recording() const
Description
True if a recorder is present and create() will call the record_with
lambda passed as its last parameter.
¶instruction* launch_task_kernel(
batch& command_batch,
const execution_command& ecmd,
const task& tsk,
const localized_chunk& chunk)
instruction* launch_task_kernel(
batch& command_batch,
const execution_command& ecmd,
const task& tsk,
const localized_chunk& chunk)
Description
Launch a device kernel for each local chunk of a task, passing the relevant buffer allocations in place of accessors and reduction descriptors.
Parameters
- batch& command_batch
- const execution_command& ecmd
- const task& tsk
- const localized_chunk& chunk
¶allocation_id new_allocation_id(memory_id mid)
allocation_id new_allocation_id(memory_id mid)
Parameters
- memory_id mid
¶void perform_atomic_write_to_allocation(
instruction* writing_instruction,
buffer_allocation_state& allocation,
const region<3>& region)
void perform_atomic_write_to_allocation(
instruction* writing_instruction,
buffer_allocation_state& allocation,
const region<3>& region)
Description
Add dependencies to the last concurrent accesses of a region, and track the instruction as the new last (unique) writer.
Parameters
- instruction* writing_instruction
- buffer_allocation_state& allocation
- const region<3>& region
¶void perform_concurrent_read_from_allocation(
instruction* reading_instruction,
buffer_allocation_state& allocation,
const region<3>& region)
void perform_concurrent_read_from_allocation(
instruction* reading_instruction,
buffer_allocation_state& allocation,
const region<3>& region)
Description
Add dependencies to the last writer of a region, and track the instruction as the new last (concurrent) reader.
Parameters
- instruction* reading_instruction
- buffer_allocation_state& allocation
- const region<3>& region
¶void perform_task_buffer_accesses(
const task& tsk,
const std::vector<localized_chunk>&
concurrent_chunks,
const std::vector<instruction*>&
command_instructions)
void perform_task_buffer_accesses(
const task& tsk,
const std::vector<localized_chunk>&
concurrent_chunks,
const std::vector<instruction*>&
command_instructions)
Description
Add dependencies for all buffer accesses and reductions of a task, then update tracking structures accordingly.
Parameters
- const task& tsk
- const std::vector<localized_chunk>& concurrent_chunks
- const std::vector<instruction*>& command_instructions
¶void perform_task_collective_operations(
const task& tsk,
const std::vector<localized_chunk>&
concurrent_chunks,
const std::vector<instruction*>&
command_instructions)
void perform_task_collective_operations(
const task& tsk,
const std::vector<localized_chunk>&
concurrent_chunks,
const std::vector<instruction*>&
command_instructions)
Description
If a task is part of a collective group, serialize it with respect to the last host task in that group.
Parameters
- const task& tsk
- const std::vector<localized_chunk>& concurrent_chunks
- const std::vector<instruction*>& command_instructions
¶void perform_task_side_effects(
const task& tsk,
const std::vector<localized_chunk>&
concurrent_chunks,
const std::vector<instruction*>&
command_instructions)
void perform_task_side_effects(
const task& tsk,
const std::vector<localized_chunk>&
concurrent_chunks,
const std::vector<instruction*>&
command_instructions)
Description
If a task has side effects, serialize it with respect to the last task that shares a host object.
Parameters
- const task& tsk
- const std::vector<localized_chunk>& concurrent_chunks
- const std::vector<instruction*>& command_instructions
¶local_reduction prepare_task_local_reduction(
batch& command_batch,
const reduction_info& rinfo,
const execution_command& ecmd,
const task& tsk,
size_t num_concurrent_chunks)
local_reduction prepare_task_local_reduction(
batch& command_batch,
const reduction_info& rinfo,
const execution_command& ecmd,
const task& tsk,
size_t num_concurrent_chunks)
Description
Create a gather allocation and optionally save the current buffer value before creating partial reduction results in any kernel.
Parameters
- batch& command_batch
- const reduction_info& rinfo
- const execution_command& ecmd
- const task& tsk
- size_t num_concurrent_chunks
¶std::string print_buffer_debug_label(
buffer_id bid) const
std::string print_buffer_debug_label(
buffer_id bid) const
Parameters
- buffer_id bid
¶void report_task_overlapping_writes(
const task& tsk,
const std::vector<localized_chunk>&
concurrent_chunks) const
void report_task_overlapping_writes(
const task& tsk,
const std::vector<localized_chunk>&
concurrent_chunks) const
Description
Detect overlapping writes between local chunks of a task and report them according to m_policy.
Parameters
- const task& tsk
- const std::vector<localized_chunk>& concurrent_chunks
¶void satisfy_task_buffer_requirements(
batch& batch,
buffer_id bid,
const task& tsk,
const subrange<3>& local_execution_range,
bool is_reduction_initializer,
const std::vector<localized_chunk>&
concurrent_chunks_after_split)
void satisfy_task_buffer_requirements(
batch& batch,
buffer_id bid,
const task& tsk,
const subrange<3>& local_execution_range,
bool is_reduction_initializer,
const std::vector<localized_chunk>&
concurrent_chunks_after_split)
Description
Allocate memory, apply any pending receives, and issue resize- and coherence copies to prepare all buffer memories for a task's execution.
Parameters
- batch& batch
- buffer_id bid
- const task& tsk
- const subrange<3>& local_execution_range
- bool is_reduction_initializer
- const std::vector<localized_chunk>& concurrent_chunks_after_split
¶std::vector<localized_chunk>
split_task_execution_range(
const execution_command& ecmd,
const task& tsk)
std::vector<localized_chunk>
split_task_execution_range(
const execution_command& ecmd,
const task& tsk)
Description
Split a task's local execution range (given by execution_command) into chunks according to device configuration and a possible oversubscription hint.
Parameters
- const execution_command& ecmd
- const task& tsk