High-Level C++ for Accelerated Clusters

Celerity aims to bring the power and ease of use of SYCL to distributed memory accelerator clusters.

Complete Celerity code sample for distributed multi-GPU matrix-vector multiplication
#include <celerity.h>
using namespace celerity;

// naive, but distributed matrix-vector multiplication!
int main() {
    // (1) declare virtualized input and output buffers
    // NOTE(review): the buffers are created from a range only, so this sample does not
    // show how `matrix` and `vector` get their contents; a real program fills them first.
    constexpr size_t size = 256;
    buffer<float, 2> matrix{{size, size}};
    buffer<float, 1> vector{{size}};
    buffer<float, 1> result{{size}};

    queue q;
    q.submit([&](handler &cgh) {
        // (2) specify data access patterns to enable distributed execution
        // matrix: a chunk of output rows needs exactly those rows, across all columns
        accessor m(matrix, cgh, [size](chunk<1> chnk) {
            return subrange<2>({chnk.offset[0], 0}, {chnk.range[0], size});
        }, read_only);
        // vector: every output element reads the *entire* vector (see the inner loop),
        // so each chunk must request all of it rather than a one-to-one slice
        accessor v(vector, cgh, access::all(), read_only);
        // result: each work item writes only its own element; previous contents are not needed
        accessor r(result, cgh, access::one_to_one(), write_only, no_init);

        // (3) launch the parallel computation: one work item per output row
        cgh.parallel_for(range<1>(size), [=](item<1> item) {
            const size_t row = item.get_id(0);
            // accumulate locally so the write-only result accessor is stored to exactly once
            float sum = 0;
            for (size_t i = 0; i < size; ++i) {
                // dot product of matrix row `row` with the vector: column i pairs with v[i]
                sum += m[row][i] * v[i];
            }
            r[item] = sum;
        });
    });
}