2#include "internal/kmeans_gpu.h"
19#include "internal/Image.h"
20#include "internal/LABAPixel.h"
21#include "internal/PixelConverters.h"
22#include "internal/RGBAPixel.h"
23#include "internal/cielab.h"
24#include "internal/gpu.h"
// Values accepted by the `color_space` argument used throughout this file:
// 0 selects the CIELAB code path, 1 selects the RGB code path (see the switch statements below).
26static constexpr uint8_t COLOR_SPACE_OPTION_CIELAB{0};
27static constexpr uint8_t COLOR_SPACE_OPTION_RGB{1};
// Tail of a packed struct; only its numCentroids field is visible in this view.
// NOTE(review): presumably this is the `Params` struct that setup() initialises with
// {num_pixels, k} and uploads as a uniform — the opening line is not visible, confirm.
31 uint32_t numCentroids;
33} __attribute__((packed));
// Closing lines of two further packed structs whose members are not visible in this view.
// NOTE(review): likely ClusterAccumulator and CentroidParams, which are used below — confirm.
40} __attribute__((packed));
46} __attribute__((packed));
// GPU-assisted centroid seeding. The visible code:
//   1. uploads the image as an RGBA32Float texture (channels divided by 255),
//   2. picks the first centroid uniformly at random,
//   3. for each further centroid runs the "dist_shader" compute pipeline with the latest
//      centroid as a uniform, reads the per-pixel distance buffer back to the CPU and samples
//      the next centroid with probability proportional to the stored value,
//   4. copies the chosen centroids into `out_centroids` and destroys its GPU resources.
// NOTE(review): the line carrying the function name and the leading parameters is not visible
// here; call sites use kMeansPlusPlusInitGpu<PixelT>(pixels, out_centroids, k, color_space) — confirm.
49template <
typename PixelT>
52 const uint8_t color_space) {
// Image dimensions; num_pixels sizes every per-pixel buffer created below.
55 size_t width = pixels.getWidth();
56 size_t height = pixels.getHeight();
57 size_t num_pixels = width * height;
// Centroids chosen so far; copied into out_centroids at the end.
59 std::vector<PixelT> centroids;
// Input texture: width x height, one RGBA32Float texel per pixel, filled from the CPU
// (CopyDst) and bound for reading (TextureBinding).
64 wgpu::TextureDescriptor texDesc = {};
65 texDesc.size = {
static_cast<uint32_t
>(width),
static_cast<uint32_t
>(height), 1};
66 texDesc.format = wgpu::TextureFormat::RGBA32Float;
67 texDesc.usage = wgpu::TextureUsage::TextureBinding | wgpu::TextureUsage::CopyDst;
68 texDesc.label =
"inputTextureInit";
69 wgpu::Texture inputTexture = GPU::getClassInstance().get_device().CreateTexture(&texDesc);
// Flatten the image into four floats per pixel, each channel divided by 255.
72 std::vector<float> gpu_pixels;
73 gpu_pixels.reserve(num_pixels * 4);
// NOTE(review): `i` is a signed int compared against the size_t num_pixels.
75 for (
int i = 0; i < num_pixels; i++) {
// LAB pixels contribute l/a/b/alpha; the second group of push_backs contributes
// red/green/blue/alpha (presumably the else branch — the separating line and the
// declaration of `p` are not visible here).
77 if constexpr (std::is_same_v<PixelT, ImageLib::LABAPixel<float>>) {
78 gpu_pixels.push_back(p.l / 255.0f);
79 gpu_pixels.push_back(p.a / 255.0f);
80 gpu_pixels.push_back(p.b / 255.0f);
81 gpu_pixels.push_back(p.alpha / 255.0f);
83 gpu_pixels.push_back(p.red / 255.0f);
84 gpu_pixels.push_back(p.green / 255.0f);
85 gpu_pixels.push_back(p.blue / 255.0f);
86 gpu_pixels.push_back(p.alpha / 255.0f);
// Upload the floats into the texture. A texel is 4 floats = 16 bytes, hence
// bytesPerRow = width * 16; the byte count is the float count times 4.
90 wgpu::TexelCopyTextureInfo texDst = {};
91 texDst.texture = inputTexture;
92 wgpu::TexelCopyBufferLayout texLayout = {};
93 texLayout.bytesPerRow = width * 16;
94 texLayout.rowsPerImage = height;
95 GPU::getClassInstance().get_queue().WriteTexture(
96 &texDst, gpu_pixels.data(), gpu_pixels.size() * 4, &texLayout, &texDesc.size);
// Per-pixel distance values, initialised to the largest finite float.
100 std::vector<float> initial_dists(num_pixels, std::numeric_limits<float>::max());
// Storage buffer holding one float per pixel; CopySrc allows the read-back copy below.
102 wgpu::BufferDescriptor distDesc = {};
103 distDesc.size = num_pixels *
sizeof(float);
105 wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
106 wgpu::Buffer minDistBuffer = GPU::getClassInstance().get_device().CreateBuffer(&distDesc);
107 GPU::getClassInstance().get_queue().WriteBuffer(minDistBuffer, 0, initial_dists.data(),
// Uniform buffer that receives the current centroid parameters on every iteration.
// (the line assigning uniDesc.size is not visible here)
112 wgpu::BufferDescriptor uniDesc = {};
114 uniDesc.usage = wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst;
115 wgpu::Buffer paramBuffer = GPU::getClassInstance().get_device().CreateBuffer(&uniDesc);
// CPU-mappable buffer of the same size as the distance buffer, used for read-back.
118 wgpu::BufferDescriptor readDesc = {};
119 readDesc.size = num_pixels *
sizeof(float);
120 readDesc.usage = wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst;
121 wgpu::Buffer readBuffer = GPU::getClassInstance().get_device().CreateBuffer(&readDesc);
// Compute pipeline created from the "dist_shader" / "updateDistShader" pair.
124 wgpu::ComputePipeline pipeline =
125 GPU::getClassInstance().createPipeline(
"dist_shader",
"updateDistShader");
// Bind group: 0 = input texture view, 1 = distance storage buffer, 2 = centroid uniform.
128 wgpu::BindGroupEntry entries[3];
129 entries[0].binding = 0;
130 entries[0].textureView = inputTexture.CreateView();
131 entries[1].binding = 1;
132 entries[1].buffer = minDistBuffer;
133 entries[1].size = distDesc.size;
134 entries[2].binding = 2;
135 entries[2].buffer = paramBuffer;
136 entries[2].size = uniDesc.size;
138 wgpu::BindGroupDescriptor bgDesc = {};
139 bgDesc.layout = pipeline.GetBindGroupLayout(0);
140 bgDesc.entryCount = 3;
141 bgDesc.entries = entries;
142 wgpu::BindGroup bindGroup = GPU::getClassInstance().get_device().CreateBindGroup(&bgDesc);
// Random generator seeded from std::random_device.
146 std::random_device rd;
147 std::mt19937 gen(rd());
// First centroid: a pixel index drawn uniformly from [0, num_pixels - 1].
// NOTE(review): num_pixels - 1 wraps around when the image is empty — confirm callers
// never pass a zero-sized image.
150 std::uniform_int_distribution<> dis(0, num_pixels - 1);
151 int first_index = dis(gen);
152 centroids.push_back(pixels[first_index]);
// Heap-allocated flag; the map callback below casts its userdata to bool*.
// NOTE(review): no matching delete is visible in this view — confirm it is released.
156 bool* done =
new bool(
false);
// One iteration per remaining centroid (indices 1 .. k-1).
158 for (
int i = 1; i < k; ++i) {
// The most recently chosen centroid is the one sent to the shader.
161 PixelT c = centroids.back();
// Pack the centroid channels (divided by 255), a constant 1.0f and the image width.
163 if constexpr (std::is_same_v<PixelT, ImageLib::LABAPixel<float>>) {
164 params =
CentroidParams{c.l / 255.0f, c.a / 255.0f, c.b / 255.0f, 1.0f,
165 static_cast<uint32_t
>(width)};
167 params =
CentroidParams{c.red / 255.0f, c.green / 255.0f, c.blue / 255.0f, 1.0f,
168 static_cast<uint32_t
>(width)};
// NOTE(review): `¶ms` looks like an HTML-entity-mangled `&params`; restore from upstream.
171 GPU::getClassInstance().get_queue().WriteBuffer(paramBuffer, 0, ¶ms,
// Record one compute pass; the dispatch covers the image with ceil(width/16) x
// ceil(height/16) workgroups (presumably a 16x16 workgroup size in the shader — confirm).
175 wgpu::CommandEncoder encoder = GPU::getClassInstance().get_device().CreateCommandEncoder();
176 wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
177 pass.SetPipeline(pipeline);
178 pass.SetBindGroup(0, bindGroup);
179 pass.DispatchWorkgroups((width + 15) / 16, (height + 15) / 16, 1);
// Copy the distance buffer into the mappable read-back buffer, then submit.
183 encoder.CopyBufferToBuffer(minDistBuffer, 0, readBuffer, 0, readDesc.size);
184 wgpu::CommandBuffer commands = encoder.Finish();
185 GPU::getClassInstance().get_queue().Submit(1, &commands);
// Arguments and callback of an asynchronous map-for-read request over the whole read-back
// buffer (the line starting the call is not visible here).
190 wgpu::MapMode::Read, 0, readDesc.size, wgpu::CallbackMode::AllowProcessEvents,
191 [](wgpu::MapAsyncStatus status, wgpu::StringView msg,
void* userdata) {
192 bool* flag = static_cast<bool*>(userdata);
193 bool success = false;
194 if (status == wgpu::MapAsyncStatus::Success) {
// Pump instance events so the callback can fire; on Emscripten also sleep 10 ms.
// (presumably inside a wait loop on the flag — the loop header is not visible here)
208 GPU::getClassInstance().get_instance().ProcessEvents();
209#if defined(__EMSCRIPTEN__)
210 emscripten_sleep(10);
// View the mapped read-back buffer as one float per pixel.
214 const float* dists = (
const float*)readBuffer.GetConstMappedRange();
// Total of all per-pixel values; upper bound of the sampling range.
216 double sum_dist_sq = 0.0;
220 for (
size_t j = 0; j < num_pixels; ++j) {
221 sum_dist_sq += dists[j];
// Weighted sampling: draw a value in [0, sum) and walk the running total until it
// reaches that value.
225 std::uniform_real_distribution<> dist_selector(0.0, sum_dist_sq);
226 double random_value = dist_selector(gen);
227 double current_sum = 0.0;
228 int selected_index = -1;
230 for (
size_t j = 0; j < num_pixels; ++j) {
231 current_sum += dists[j];
232 if (current_sum >= random_value) {
// Fallback: if the walk selected nothing, use the last pixel.
238 if (selected_index == -1) selected_index = num_pixels - 1;
241 centroids.push_back(pixels[selected_index]);
// On Emscripten, sleep 10 ms here.
243#if defined(__EMSCRIPTEN__)
244 emscripten_sleep(10);
// Hand the chosen centroids back to the caller; out_centroids must already hold at
// least centroids.size() elements because std::copy writes through begin().
248 std::copy(centroids.begin(), centroids.end(), out_centroids.begin());
// Explicitly destroy the GPU resources created above.
251 if (inputTexture) inputTexture.Destroy();
252 readBuffer.Destroy();
253 minDistBuffer.Destroy();
254 paramBuffer.Destroy();
// On Emscripten, sleep 50 ms before returning.
257#if defined(__EMSCRIPTEN__)
258 emscripten_sleep(50);
// Tail of the parameter list and body of the GPU resource setup routine (kmeans_gpu calls it
// as setup(pixels, lab, centroids, centroids_lab, width, height, k, ...); the head of the
// signature is not visible here). The visible code creates and fills the input texture,
// the centroid texture and the label texture, uploads a Params uniform and a zeroed
// accumulator buffer, creates two compute pipelines and builds one bind group for each.
// Textures, descriptors, pipelines and bind groups are returned through the reference parameters.
266 const int32_t height,
const int32_t k, wgpu::Texture& inputTexture,
267 wgpu::Texture& labelTexture, wgpu::Texture& centroidTexture,
268 wgpu::TextureDescriptor& labelDesc, wgpu::TextureDescriptor& centroidDesc,
269 wgpu::ComputePipeline& pipeline1, wgpu::ComputePipeline& pipeline2,
270 wgpu::BindGroup& bindGroup1, wgpu::BindGroup& bindGroup2,
const uint8_t color_space) {
// 16 bytes per texel: four 32-bit channels.
271 int bytesPerPixel{16};
272 const int32_t num_pixels{pixels.getSize()};
// Input texture: width x height RGBA32Float, written from the CPU and bound for reading.
274 wgpu::TextureDescriptor texDesc = {};
275 texDesc.size = {
static_cast<uint32_t
>(width),
static_cast<uint32_t
>(height), 1};
276 texDesc.format = wgpu::TextureFormat::RGBA32Float;
277 texDesc.usage = wgpu::TextureUsage::TextureBinding | wgpu::TextureUsage::CopyDst;
278 texDesc.label =
"inputTexture";
279 inputTexture = GPU::getClassInstance().get_device().CreateTexture(&texDesc);
// Source layout for the upload: tightly packed rows of width * 16 bytes.
281 wgpu::TexelCopyTextureInfo dst = {};
282 dst.texture = inputTexture;
283 wgpu::TexelCopyBufferLayout layout = {};
285 layout.bytesPerRow = width * bytesPerPixel;
286 layout.rowsPerImage = height;
// Flatten the image into four floats per pixel (channels divided by 255), taking RGB or
// LAB channels depending on color_space. (the declarations of `p` are not visible here)
288 std::vector<float> pixels_;
289 for (
int i = 0; i < num_pixels; i++) {
290 switch (color_space) {
291 case COLOR_SPACE_OPTION_RGB: {
293 pixels_.push_back(p.red / 255.0f);
294 pixels_.push_back(p.green / 255.0f);
295 pixels_.push_back(p.blue / 255.0f);
296 pixels_.push_back(p.alpha / 255.0f);
299 case COLOR_SPACE_OPTION_CIELAB: {
301 pixels_.push_back(p.l / 255.0f);
302 pixels_.push_back(p.a / 255.0f);
303 pixels_.push_back(p.b / 255.0f);
304 pixels_.push_back(p.alpha / 255.0f);
// Upload the flattened pixels into the input texture.
310 GPU::getClassInstance().get_queue().WriteTexture(
311 &dst, pixels_.data(), pixels_.size() *
sizeof(
float), &layout, &texDesc.size);
// Centroid texture: k x 1 RGBA32Float. It is bound both as a texture and as a storage
// texture, and can be copied to (initial upload) and from (read-back by the caller).
314 centroidDesc.size = {
static_cast<uint32_t
>(k), 1, 1};
315 centroidDesc.format = wgpu::TextureFormat::RGBA32Float;
316 centroidDesc.usage = wgpu::TextureUsage::TextureBinding | wgpu::TextureUsage::StorageBinding |
317 wgpu::TextureUsage::CopyDst | wgpu::TextureUsage::CopySrc;
318 centroidDesc.label =
"centroidTexture";
// NOTE(review): `¢roidDesc` looks like an HTML-entity-mangled `&centroidDesc`; restore from upstream.
319 centroidTexture = GPU::getClassInstance().get_device().CreateTexture(¢roidDesc);
// Source layout for the centroid upload: a single row of k texels.
321 wgpu::TexelCopyTextureInfo cdst = {};
322 cdst.texture = centroidTexture;
323 wgpu::TexelCopyBufferLayout clayout = {};
325 clayout.bytesPerRow = k * bytesPerPixel;
326 clayout.rowsPerImage = 1;
// Flatten the initial centroids (RGB from `centroids`, LAB from `centroids_lab`) into
// four floats each, channels divided by 255.
328 std::vector<float> centroids_;
329 switch (color_space) {
330 case COLOR_SPACE_OPTION_RGB: {
331 for (
int i = 0; i < k; i++) {
332 auto p = centroids[i];
333 centroids_.push_back(p.red / 255.0f);
334 centroids_.push_back(p.green / 255.0f);
335 centroids_.push_back(p.blue / 255.0f);
336 centroids_.push_back(p.alpha / 255.0f);
340 case COLOR_SPACE_OPTION_CIELAB: {
341 for (
int i = 0; i < k; i++) {
342 auto p = centroids_lab[i];
343 centroids_.push_back(p.l / 255.0f);
344 centroids_.push_back(p.a / 255.0f);
345 centroids_.push_back(p.b / 255.0f);
346 centroids_.push_back(p.alpha / 255.0f);
// Upload the initial centroids.
// NOTE(review): `¢roidDesc` below is the same mangling of `&centroidDesc`.
352 GPU::getClassInstance().get_queue().WriteTexture(
353 &cdst, centroids_.data(), centroids_.size() *
sizeof(
float), &clayout, ¢roidDesc.size);
// Label texture: width x height RGBA32Uint, usable as texture/storage binding and as a
// copy source for the caller's read-back.
356 labelDesc.size = {
static_cast<uint32_t
>(width),
static_cast<uint32_t
>(height), 1};
357 labelDesc.format = wgpu::TextureFormat::RGBA32Uint;
358 labelDesc.usage = wgpu::TextureUsage::TextureBinding | wgpu::TextureUsage::StorageBinding |
359 wgpu::TextureUsage::CopyDst | wgpu::TextureUsage::CopySrc;
360 labelDesc.label =
"labelTexture";
361 labelTexture = GPU::getClassInstance().get_device().CreateTexture(&labelDesc);
// Uniform buffer holding {num_pixels, k}.
364 Params params = {
static_cast<uint32_t
>(num_pixels),
static_cast<uint32_t
>(k)};
365 wgpu::BufferDescriptor bufDesc = {};
366 bufDesc.size =
sizeof(
Params);
367 bufDesc.usage = wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst;
368 wgpu::Buffer paramBuffer = GPU::getClassInstance().get_device().CreateBuffer(&bufDesc);
// NOTE(review): `¶ms` looks like an HTML-entity-mangled `&params`; restore from upstream.
369 GPU::getClassInstance().get_queue().WriteBuffer(paramBuffer, 0, ¶ms,
sizeof(
Params));
// Accumulator storage buffer: k ClusterAccumulator entries, all fields zero.
// (the line assigning accDesc.size is not visible here)
372 std::vector<ClusterAccumulator> reset_centroids(k, {0, 0, 0, 0});
373 wgpu::BufferDescriptor accDesc = {};
375 accDesc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopyDst;
376 wgpu::Buffer accBuffer = GPU::getClassInstance().get_device().CreateBuffer(&accDesc);
377 GPU::getClassInstance().get_queue().WriteBuffer(accBuffer, 0, reset_centroids.data(),
// Two compute pipelines: "assign_update_shader" (presumably assigned to pipeline1 — the
// left-hand side is not visible here) and "resolve_shader" for pipeline2.
382 GPU::getClassInstance().createPipeline(
"assign_update_shader",
"assignUpdateShader");
383 pipeline2 = GPU::getClassInstance().createPipeline(
"resolve_shader",
"resolveShader");
// Bind group for pipeline1: 0 = input texture, 1 = centroid texture, 2 = label texture,
// 3 = Params uniform, 4 = accumulator buffer.
386 wgpu::BindGroupDescriptor bindGroupDesc1 = {};
387 bindGroupDesc1.layout = pipeline1.GetBindGroupLayout(0);
388 wgpu::BindGroupEntry entries1[5];
390 entries1[0].binding = 0;
391 entries1[0].textureView = inputTexture.CreateView();
393 entries1[1].binding = 1;
394 entries1[1].textureView = centroidTexture.CreateView();
396 entries1[2].binding = 2;
397 entries1[2].textureView = labelTexture.CreateView();
399 entries1[3].binding = 3;
400 entries1[3].buffer = paramBuffer;
401 entries1[3].size =
sizeof(
Params);
403 entries1[4].binding = 4;
404 entries1[4].buffer = accBuffer;
407 bindGroupDesc1.entryCount = 5;
408 bindGroupDesc1.entries = entries1;
409 bindGroup1 = GPU::getClassInstance().get_device().CreateBindGroup(&bindGroupDesc1);
// Bind group for pipeline2: 0 = accumulator buffer, 1 = centroid texture.
411 wgpu::BindGroupDescriptor bindGroupDesc2 = {};
412 bindGroupDesc2.layout = pipeline2.GetBindGroupLayout(0);
413 wgpu::BindGroupEntry entries2[2];
414 entries2[0].binding = 0;
415 entries2[0].buffer = accBuffer;
416 entries2[0].size = accDesc.size;
417 entries2[1].binding = 1;
418 entries2[1].textureView = centroidTexture.CreateView();
419 bindGroupDesc2.entryCount = 2;
420 bindGroupDesc2.entries = entries2;
421 bindGroup2 = GPU::getClassInstance().get_device().CreateBindGroup(&bindGroupDesc2);
// GPU k-means over an image buffer.
//   data        input bytes, loaded as a width x height image via ImageLib::RGBA_CONVERTER<float>
//   out_data    receives 4 bytes per pixel: red/green/blue of the pixel's centroid, alpha 255
//   out_labels  receives num_pixels int32 cluster indices
//   k           number of clusters
//   max_iter    number of assign/resolve pass pairs recorded
//   color_space COLOR_SPACE_OPTION_RGB or COLOR_SPACE_OPTION_CIELAB
// Progress messages are written to std::cout.
424void kmeans_gpu(
const uint8_t* data, uint8_t* out_data, int32_t* out_labels,
const int32_t width,
425 const int32_t height,
const int32_t k,
const int32_t max_iter,
426 const uint8_t color_space) {
// Load the input bytes into `pixels` (its declaration is not visible here).
428 pixels.loadFromBuffer(data, width, height, ImageLib::RGBA_CONVERTER<float>);
429 const int32_t num_pixels{pixels.getSize()};
// One label per pixel, initialised to -1.
436 std::vector<int32_t> labels(num_pixels, -1);
// For CIELAB, convert every pixel into `lab` up front.
440 if (color_space == COLOR_SPACE_OPTION_CIELAB) {
441 for (
int i{0}; i < pixels.getSize(); ++i) {
442 rgb_to_lab<float, float>(pixels[i], lab[i]);
446 std::cout <<
"starting" << std::endl;
// Seed the centroids in the selected colour space.
449 switch (color_space) {
450 case COLOR_SPACE_OPTION_RGB: {
451 kMeansPlusPlusInitGpu<ImageLib::RGBAPixel<float>>(pixels, centroids, k, color_space);
454 case COLOR_SPACE_OPTION_CIELAB: {
455 kMeansPlusPlusInitGpu<ImageLib::LABAPixel<float>>(lab, centroids_lab, k, color_space);
459 std::cout <<
"kmeans++ init done" << std::endl;
// 16 bytes per texel: four 32-bit channels.
462 int bytesPerPixel{16};
// Handles and descriptors filled in by setup().
467 wgpu::ComputePipeline pipeline1;
468 wgpu::ComputePipeline pipeline2;
469 wgpu::BindGroup bindGroup1;
470 wgpu::BindGroup bindGroup2;
471 wgpu::Texture inputTexture;
472 wgpu::Texture labelTexture;
473 wgpu::Texture centroidTexture;
474 wgpu::TextureDescriptor labelDesc = {};
475 wgpu::TextureDescriptor centroidDesc = {};
// (the final argument line of this call is not visible here)
478 setup(pixels, lab, centroids, centroids_lab, width, height, k, inputTexture, labelTexture,
479 centroidTexture, labelDesc, centroidDesc, pipeline1, pipeline2, bindGroup1, bindGroup2,
// Workgroup counts covering the image: ceil(width/16) x ceil(height/16).
482 uint32_t wgX = (width + 15) / 16;
483 uint32_t wgY = (height + 15) / 16;
// Read-back buffer for the label texture; the row pitch comes from
// GPU::getAlignedBytesPerRow and the buffer holds `height` such rows.
486 uint32_t bytesPerRowLabels =
487 GPU::getAlignedBytesPerRow(width,
static_cast<uint32_t
>(bytesPerPixel));
488 wgpu::BufferDescriptor readLabelsDesc = {};
489 readLabelsDesc.size = bytesPerRowLabels * height;
490 readLabelsDesc.usage = wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst;
491 wgpu::Buffer readLabelsBuffer =
492 GPU::getClassInstance().get_device().CreateBuffer(&readLabelsDesc);
// Read-back buffer for the centroid texture (a single row).
// NOTE(review): the row pitch is computed from `width`, but setup() creates the centroid
// texture k texels wide; if k > width this buffer may be too small — confirm whether `k`
// was intended here.
495 uint32_t bytesPerRowCentroids =
496 GPU::getAlignedBytesPerRow(width,
static_cast<uint32_t
>(bytesPerPixel));
497 wgpu::BufferDescriptor readCentroidsDesc = {};
498 readCentroidsDesc.size = bytesPerRowCentroids;
499 readCentroidsDesc.usage = wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst;
500 wgpu::Buffer readCentroidsBuffer =
501 GPU::getClassInstance().get_device().CreateBuffer(&readCentroidsDesc);
// A single command encoder is created before the loop; each iteration begins a pass on
// pipeline1 (dispatched over the image) and a pass on pipeline2 (ceil(k/256) workgroups).
504 std::cout <<
"start iterations" << std::endl;
505 wgpu::CommandEncoder encoder = GPU::getClassInstance().get_device().CreateCommandEncoder();
506 for (int32_t iter{0}; iter < max_iter; ++iter) {
507 wgpu::ComputePassEncoder pass1 = encoder.BeginComputePass();
508 pass1.SetPipeline(pipeline1);
509 pass1.SetBindGroup(0, bindGroup1);
510 pass1.DispatchWorkgroups(wgX, wgY);
513 wgpu::ComputePassEncoder pass2 = encoder.BeginComputePass();
514 pass2.SetPipeline(pipeline2);
515 pass2.SetBindGroup(0, bindGroup2);
516 pass2.DispatchWorkgroups((k + 255) / 256, 1);
// Copy the label texture into its read-back buffer using the aligned row pitch.
523 wgpu::TexelCopyTextureInfo srcLabels = {};
524 srcLabels.texture = labelTexture;
525 wgpu::TexelCopyBufferInfo dstLabels = {};
526 dstLabels.buffer = readLabelsBuffer;
527 dstLabels.layout.bytesPerRow = bytesPerRowLabels;
528 dstLabels.layout.rowsPerImage = height;
529 encoder.CopyTextureToBuffer(&srcLabels, &dstLabels, &labelDesc.size);
// Copy the centroid texture into its read-back buffer.
532 wgpu::TexelCopyTextureInfo srcCentroids = {};
533 srcCentroids.texture = centroidTexture;
534 wgpu::TexelCopyBufferInfo dstCentroids = {};
535 dstCentroids.buffer = readCentroidsBuffer;
536 dstCentroids.layout.bytesPerRow = bytesPerRowCentroids;
537 dstCentroids.layout.rowsPerImage = 1;
// NOTE(review): `¢roidDesc` looks like an HTML-entity-mangled `&centroidDesc`; restore from upstream.
538 encoder.CopyTextureToBuffer(&srcCentroids, &dstCentroids, ¢roidDesc.size);
// Finish recording and submit everything as one command buffer.
540 wgpu::CommandBuffer commands = encoder.Finish();
541 GPU::getClassInstance().get_queue().Submit(1, &commands);
542 std::cout <<
"done iterations" << std::endl;
// Heap-allocated flags; the map callbacks below cast their userdata to bool*.
// NOTE(review): no matching delete is visible in this view — confirm they are released.
545 bool* done1 =
new bool(
false);
546 bool* done2 =
new bool(
false);
// Request an asynchronous read mapping of the whole label read-back buffer.
549 readLabelsBuffer.MapAsync(
550 wgpu::MapMode::Read, 0, readLabelsDesc.size, wgpu::CallbackMode::AllowProcessEvents,
551 [](wgpu::MapAsyncStatus status, wgpu::StringView msg,
void* userdata) {
552 bool* flag = static_cast<bool*>(userdata);
553 bool success = false;
554 if (status == wgpu::MapAsyncStatus::Success) {
562 std::cout <<
"read out" << std::endl;
// Pump instance events so the callback can fire; on Emscripten also sleep 10 ms.
// (presumably inside a wait loop on the flag — the loop header is not visible here)
565 GPU::getClassInstance().get_instance().ProcessEvents();
566#if defined(__EMSCRIPTEN__)
567 emscripten_sleep(10);
571 std::cout <<
"mapping labels" << std::endl;
572 const uint8_t* mappedData = (
const uint8_t*)readLabelsBuffer.GetConstMappedRange();
// Walk the mapped rows using the aligned pitch; the first 32-bit word of each 16-byte
// texel is taken as the pixel's label. (`r` is declared on a line not visible here)
575 for (
size_t y = 0; y < height; ++y) {
576 const uint8_t* rowPtr = mappedData + (y * bytesPerRowLabels);
577 for (
size_t x = 0; x < width; ++x) {
578 const uint8_t* pixelPtr = rowPtr + (x * bytesPerPixel);
580 std::memcpy(&r, pixelPtr,
sizeof(uint32_t));
582 size_t dstIndex = y * width + x;
583 labels[dstIndex] =
static_cast<int32_t
>(r);
587 readLabelsBuffer.Unmap();
// Same mapping sequence for the centroid read-back buffer.
590 readCentroidsBuffer.MapAsync(
591 wgpu::MapMode::Read, 0, readCentroidsDesc.size, wgpu::CallbackMode::AllowProcessEvents,
592 [](wgpu::MapAsyncStatus status, wgpu::StringView msg,
void* userdata) {
593 bool* flag =
static_cast<bool*
>(userdata);
594 bool success =
false;
595 if (status == wgpu::MapAsyncStatus::Success) {
604 GPU::getClassInstance().get_instance().ProcessEvents();
605#if defined(__EMSCRIPTEN__)
606 emscripten_sleep(10);
610 std::cout <<
"mapping centroids" << std::endl;
611 const float* mappedDataFloat = (
const float*)readCentroidsBuffer.GetConstMappedRange();
// Each centroid occupies four consecutive floats in the mapped row; they are read as
// r/g/b/a and handled per colour space (the case bodies are not visible here).
614 for (
int i = 0; i < k; i++) {
616 const float* centroidPtr = mappedDataFloat + (i * 4);
618 float r = *(centroidPtr);
619 float g = *(centroidPtr + 1);
620 float b = *(centroidPtr + 2);
621 float a = *(centroidPtr + 3);
622 switch (color_space) {
623 case COLOR_SPACE_OPTION_RGB: {
628 case COLOR_SPACE_OPTION_CIELAB: {
636 readCentroidsBuffer.Unmap();
// For CIELAB, convert the final LAB centroids into the RGB `centroids` used for output.
639 if (color_space == COLOR_SPACE_OPTION_CIELAB) {
640 for (int32_t i{0}; i < k; ++i) {
641 lab_to_rgb<float, float>(centroids_lab[i], centroids[i]);
// Write each pixel as its centroid's colour with alpha forced to 255.
// NOTE(review): labels start at -1 and index `centroids` without a bounds check — confirm
// every pixel receives a label in [0, k).
645 for (int32_t i = 0; i < num_pixels; ++i) {
646 const int32_t cluster = labels[i];
647 out_data[i * 4 + 0] =
static_cast<uint8_t
>(centroids[cluster].red);
648 out_data[i * 4 + 1] =
static_cast<uint8_t
>(centroids[cluster].green);
649 out_data[i * 4 + 2] =
static_cast<uint8_t
>(centroids[cluster].blue);
650 out_data[i * 4 + 3] = 255;
654 std::cout <<
"copying labels out" << std::endl;
655 std::memcpy(out_labels, labels.data(), labels.size() *
sizeof(int32_t));
// Explicitly destroy the GPU textures and read-back buffers.
657 if (inputTexture) inputTexture.Destroy();
658 if (labelTexture) labelTexture.Destroy();
659 if (centroidTexture) centroidTexture.Destroy();
660 readLabelsBuffer.Destroy();
661 readCentroidsBuffer.Destroy();
666 labels.shrink_to_fit();
// On Emscripten, sleep 50 ms before returning.
667#if defined(__EMSCRIPTEN__)
668 emscripten_sleep(50);
Core image processing functions for img2num project.