Built on SYCL
Celerity is built on top of Khronos' emerging industry standard for accelerator programming.
Designed for HPC
Transparently run your workloads on large-scale accelerator clusters.
#include <celerity.h>
using namespace celerity;
// naive, but distributed matrix-vector multiplication!
int main() {
// (1) declare virtualized input and output buffers
constexpr size_t size = 256;
buffer<float, 2> matrix{{size, size}};
buffer<float, 1> vector{{size}};
buffer<float, 1> result{{size}};
distr_queue q;
q.submit([&](handler &cgh) {
// (2) specify data access patterns to enable distributed execution
accessor m(matrix, cgh, [size](chunk<1> chnk) {
return subrange<2>({chnk.offset[0], 0}, {chnk.range[0], size});
}, read_only);
accessor v(vector, cgh, access::one_to_one(), read_only);
accessor r(result, cgh, access::one_to_one(), write_only, no_init);
// (3) launch the parallel computation
cgh.parallel_for(range<1>(size), [=](item<1> item) {
r[item] = 0;
for (size_t i = 0; i < size; ++i) {
r[item] += m[item.get_id(0)][i] * v[i];
}
});
});
}
Complete Celerity Code Sample for Distributed Multi-GPU Matrix-vector Multiplication