2#include "internal/kmeans_gpu.h"
5#include "internal/cielab.h"
6#include "internal/gpu.h"
7#include "internal/Image.h"
8#include "internal/LABAPixel.h"
9#include "internal/PixelConverters.h"
10#include "internal/RGBAPixel.h"
// Selector values compared against the `color_space` argument further down in
// this file: 0 selects the CIELAB pixel path (l / a / b / alpha channels) and
// 1 selects the RGB path (red / green / blue / alpha channels).
26static constexpr uint8_t COLOR_SPACE_OPTION_CIELAB {0};
27static constexpr uint8_t COLOR_SPACE_OPTION_RGB {1};
34 uint32_t numCentroids;
38__attribute__((packed))
55__attribute__((packed))
71__attribute__((packed))
// Seeds k centroids for k-means, with the per-pixel distance update executed
// by a GPU compute shader ("dist_shader" / "updateDistShader").
//
// Visible flow:
//   1. Upload the image as an RGBA32Float texture (each channel divided by 255).
//   2. Pick the first centroid uniformly at random.
//   3. For every further centroid: send the most recent centroid to the shader,
//      dispatch it over the image, read the distance buffer back, and sample
//      the next centroid with probability proportional to each pixel's value
//      in that buffer.
//   4. Copy the chosen centroids into `out_centroids` and destroy GPU resources.
//
// PixelT selects which channels are read: ImageLib::LABAPixel<float> uses
// l / a / b / alpha, any other type uses red / green / blue / alpha.
//
// NOTE(review): this excerpt is missing lines (parts of the signature, the
// `else` branches, closing braces). `pixels`, `out_centroids` and `k` are
// used below but their declarations are not visible here.
79template <
typename PixelT>
80void kMeansPlusPlusInitGpu(
82 const uint8_t color_space
87 size_t width = pixels.getWidth();
88 size_t height = pixels.getHeight();
89 size_t num_pixels = width * height;
// Centroids are collected locally and copied to `out_centroids` at the end.
91 std::vector<PixelT> centroids;
// Input image texture: one RGBA32Float texel (16 bytes) per pixel.
96 wgpu::TextureDescriptor texDesc = {};
97 texDesc.size = {
static_cast<uint32_t
>(width),
static_cast<uint32_t
>(height), 1};
98 texDesc.format = wgpu::TextureFormat::RGBA32Float;
99 texDesc.usage = wgpu::TextureUsage::TextureBinding | wgpu::TextureUsage::CopyDst;
100 texDesc.label =
"inputTextureInit";
101 wgpu::Texture inputTexture = GPU::getClassInstance().get_device().CreateTexture(&texDesc);
// CPU-side staging copy: four floats per pixel, capacity reserved up front.
104 std::vector<float> gpu_pixels;
105 gpu_pixels.reserve(num_pixels * 4);
// NOTE(review): `int i` is compared against the size_t `num_pixels`
// (signed/unsigned mix).
107 for (
int i = 0; i < num_pixels; i++) {
108 PixelT p = pixels[i];
109 if constexpr (std::is_same_v<PixelT, ImageLib::LABAPixel<float>>) {
110 gpu_pixels.push_back(p.l / 255.0f);
111 gpu_pixels.push_back(p.a / 255.0f);
112 gpu_pixels.push_back(p.b / 255.0f);
113 gpu_pixels.push_back(p.alpha / 255.0f);
// Non-LAB pixel path (the `else` line itself is not part of this excerpt).
115 gpu_pixels.push_back(p.red / 255.0f);
116 gpu_pixels.push_back(p.green / 255.0f);
117 gpu_pixels.push_back(p.blue / 255.0f);
118 gpu_pixels.push_back(p.alpha / 255.0f);
// Upload the staging data. 16 = bytes per RGBA32Float texel, and
// gpu_pixels.size() * 4 is the byte count (4 bytes per float).
122 wgpu::TexelCopyTextureInfo texDst = {};
123 texDst.texture = inputTexture;
124 wgpu::TexelCopyBufferLayout texLayout = {};
125 texLayout.bytesPerRow = width * 16;
126 texLayout.rowsPerImage = height;
127 GPU::getClassInstance().get_queue().WriteTexture(
128 &texDst, gpu_pixels.data(), gpu_pixels.size() * 4, &texLayout, &texDesc.size
// Per-pixel distance buffer, initialised to the largest float value.
133 std::vector<float> initial_dists(num_pixels, std::numeric_limits<float>::max());
135 wgpu::BufferDescriptor distDesc = {};
136 distDesc.size = num_pixels *
sizeof(float);
138 wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
139 wgpu::Buffer minDistBuffer = GPU::getClassInstance().get_device().CreateBuffer(&distDesc);
140 GPU::getClassInstance().get_queue().WriteBuffer(
141 minDistBuffer, 0, initial_dists.data(), distDesc.size
// Uniform buffer holding the shader parameters (rewritten for every new
// centroid). The line that sets uniDesc.size is not part of this excerpt.
146 wgpu::BufferDescriptor uniDesc = {};
148 uniDesc.usage = wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst;
149 wgpu::Buffer paramBuffer = GPU::getClassInstance().get_device().CreateBuffer(&uniDesc);
// Mappable readback buffer that receives a copy of the distance buffer.
152 wgpu::BufferDescriptor readDesc = {};
153 readDesc.size = num_pixels *
sizeof(float);
154 readDesc.usage = wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst;
155 wgpu::Buffer readBuffer = GPU::getClassInstance().get_device().CreateBuffer(&readDesc);
158 wgpu::ComputePipeline pipeline =
159 GPU::getClassInstance().createPipeline(
"dist_shader",
"updateDistShader");
// Bind group: 0 = input texture view, 1 = distance buffer, 2 = parameters.
162 wgpu::BindGroupEntry entries[3];
163 entries[0].binding = 0;
164 entries[0].textureView = inputTexture.CreateView();
165 entries[1].binding = 1;
166 entries[1].buffer = minDistBuffer;
167 entries[1].size = distDesc.size;
168 entries[2].binding = 2;
169 entries[2].buffer = paramBuffer;
170 entries[2].size = uniDesc.size;
172 wgpu::BindGroupDescriptor bgDesc = {};
173 bgDesc.layout = pipeline.GetBindGroupLayout(0);
174 bgDesc.entryCount = 3;
175 bgDesc.entries = entries;
176 wgpu::BindGroup bindGroup = GPU::getClassInstance().get_device().CreateBindGroup(&bgDesc);
// Non-deterministic seeding: results differ from run to run.
180 std::random_device rd;
181 std::mt19937 gen(rd());
// First centroid: a pixel chosen uniformly at random.
// NOTE(review): `num_pixels - 1` wraps around when the image is empty
// (size_t); no emptiness check is visible in this excerpt.
184 std::uniform_int_distribution<> dis(0, num_pixels - 1);
185 int first_index = dis(gen);
186 centroids.push_back(pixels[first_index]);
// Completion flag handed to the map callback.
// NOTE(review): allocated with `new`; no matching `delete` is visible in this
// excerpt. Confirm it is released, or use a stack variable.
190 bool* done =
new bool(
false);
// One iteration per remaining centroid.
192 for (
int i = 1; i < k; ++i) {
// The most recently chosen centroid is the one the shader measures against.
195 PixelT c = centroids.back();
197 if constexpr (std::is_same_v<PixelT, ImageLib::LABAPixel<float>>) {
199 c.l / 255.0f, c.a / 255.0f, c.b / 255.0f, 1.0f,
static_cast<uint32_t
>(width)};
202 c.red / 255.0f, c.green / 255.0f, c.blue / 255.0f, 1.0f,
203 static_cast<uint32_t
>(width)};
206 GPU::getClassInstance().get_queue().WriteBuffer(
// Encode one compute pass over the image plus the copy into the readback buffer.
// The grid is ceil(width / 16) x ceil(height / 16) workgroups; presumably the
// shader declares a 16x16 workgroup size (TODO confirm against the shader).
211 wgpu::CommandEncoder encoder = GPU::getClassInstance().get_device().CreateCommandEncoder();
212 wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
213 pass.SetPipeline(pipeline);
214 pass.SetBindGroup(0, bindGroup);
215 pass.DispatchWorkgroups((width + 15) / 16, (height + 15) / 16, 1);
219 encoder.CopyBufferToBuffer(minDistBuffer, 0, readBuffer, 0, readDesc.size);
220 wgpu::CommandBuffer commands = encoder.Finish();
221 GPU::getClassInstance().get_queue().Submit(1, &commands);
// Asynchronous map of the readback buffer; the callback receives the flag
// through `userdata`.
226 wgpu::MapMode::Read, 0, readDesc.size, wgpu::CallbackMode::AllowProcessEvents,
227 [](wgpu::MapAsyncStatus status, wgpu::StringView msg,
void* userdata) {
228 bool* flag = static_cast<bool*>(userdata);
229 bool success = false;
230 if (status == wgpu::MapAsyncStatus::Success) {
// Pump instance events (the surrounding wait loop is not part of this
// excerpt); Emscripten builds also sleep 10 ms here.
245 GPU::getClassInstance().get_instance().ProcessEvents();
246#if defined(__EMSCRIPTEN__)
247 emscripten_sleep(10);
251 const float* dists = (
const float*)readBuffer.GetConstMappedRange();
// Total weight, accumulated in double.
253 double sum_dist_sq = 0.0;
257 for (
size_t j = 0; j < num_pixels; ++j) {
258 sum_dist_sq += dists[j];
// Weighted sampling: draw a value in [0, total) and walk the running sum
// until it reaches that value.
262 std::uniform_real_distribution<> dist_selector(0.0, sum_dist_sq);
263 double random_value = dist_selector(gen);
264 double current_sum = 0.0;
265 int selected_index = -1;
267 for (
size_t j = 0; j < num_pixels; ++j) {
268 current_sum += dists[j];
269 if (current_sum >= random_value) {
// Fallback when the scan never reached the drawn value: use the last pixel.
275 if (selected_index == -1)
276 selected_index = num_pixels - 1;
279 centroids.push_back(pixels[selected_index]);
281#if defined(__EMSCRIPTEN__)
282 emscripten_sleep(10);
// Hand the result to the caller.
// NOTE(review): std::copy writes through out_centroids.begin(), so the
// destination must already hold at least centroids.size() elements.
286 std::copy(centroids.begin(), centroids.end(), out_centroids.begin());
// Explicitly release the GPU resources created by this function.
290 inputTexture.Destroy();
291 readBuffer.Destroy();
292 minDistBuffer.Destroy();
293 paramBuffer.Destroy();
296#if defined(__EMSCRIPTEN__)
297 emscripten_sleep(50);
// GPU resource setup for the k-means iterations. The function name and the
// first parameters are not part of this excerpt; what is visible creates and
// fills:
//   - the input image texture (RGBA32Float, width x height),
//   - the centroid texture (RGBA32Float, k x 1),
//   - the label texture (RGBA32Uint, width x height),
//   - a uniform buffer with Params{num_pixels, k},
//   - a zero-initialised accumulator buffer with k ClusterAccumulator entries,
//   - the "assign_update_shader" and "resolve_shader" pipelines and one bind
//     group for each.
// Textures, descriptors, pipelines and bind groups are returned to the caller
// through the reference parameters.
//
// NOTE(review): the tokens `¢roidDesc` and `¶ms` below look like mis-decoded
// `&centroidDesc` / `&params` (HTML-entity damage). Confirm against the real
// source; as written they do not compile.
306 const int32_t height,
const int32_t k, wgpu::Texture& inputTexture, wgpu::Texture& labelTexture,
307 wgpu::Texture& centroidTexture, wgpu::TextureDescriptor& labelDesc,
308 wgpu::TextureDescriptor& centroidDesc, wgpu::ComputePipeline& pipeline1,
309 wgpu::ComputePipeline& pipeline2, wgpu::BindGroup& bindGroup1, wgpu::BindGroup& bindGroup2,
310 const uint8_t color_space
// 16 bytes = four 32-bit channels per texel.
312 int bytesPerPixel {16};
313 const int32_t num_pixels {pixels.getSize()};
// Input image texture.
315 wgpu::TextureDescriptor texDesc = {};
316 texDesc.size = {
static_cast<uint32_t
>(width),
static_cast<uint32_t
>(height), 1};
317 texDesc.format = wgpu::TextureFormat::RGBA32Float;
318 texDesc.usage = wgpu::TextureUsage::TextureBinding | wgpu::TextureUsage::CopyDst;
319 texDesc.label =
"inputTexture";
320 inputTexture = GPU::getClassInstance().get_device().CreateTexture(&texDesc);
322 wgpu::TexelCopyTextureInfo dst = {};
323 dst.texture = inputTexture;
324 wgpu::TexelCopyBufferLayout layout = {};
326 layout.bytesPerRow = width * bytesPerPixel;
327 layout.rowsPerImage = height;
// Staging copy of the image, four floats per pixel, each channel divided by
// 255. The declarations of `p` are on lines missing from this excerpt.
// NOTE(review): no reserve() before the push_back loop; the final size
// (num_pixels * 4) is known in advance.
329 std::vector<float> pixels_;
330 for (
int i = 0; i < num_pixels; i++) {
331 switch (color_space) {
332 case COLOR_SPACE_OPTION_RGB: {
334 pixels_.push_back(p.red / 255.0f);
335 pixels_.push_back(p.green / 255.0f);
336 pixels_.push_back(p.blue / 255.0f);
337 pixels_.push_back(p.alpha / 255.0f);
340 case COLOR_SPACE_OPTION_CIELAB: {
342 pixels_.push_back(p.l / 255.0f);
343 pixels_.push_back(p.a / 255.0f);
344 pixels_.push_back(p.b / 255.0f);
345 pixels_.push_back(p.alpha / 255.0f);
351 GPU::getClassInstance().get_queue().WriteTexture(
352 &dst, pixels_.data(), pixels_.size() *
sizeof(
float), &layout, &texDesc.size
// Centroid texture: one texel per centroid in a single row. It is bound to
// both pipelines below and is also a copy source for readback.
356 centroidDesc.size = {
static_cast<uint32_t
>(k), 1, 1};
357 centroidDesc.format = wgpu::TextureFormat::RGBA32Float;
358 centroidDesc.usage = wgpu::TextureUsage::TextureBinding | wgpu::TextureUsage::StorageBinding |
359 wgpu::TextureUsage::CopyDst | wgpu::TextureUsage::CopySrc;
360 centroidDesc.label =
"centroidTexture";
361 centroidTexture = GPU::getClassInstance().get_device().CreateTexture(¢roidDesc);
363 wgpu::TexelCopyTextureInfo cdst = {};
364 cdst.texture = centroidTexture;
365 wgpu::TexelCopyBufferLayout clayout = {};
367 clayout.bytesPerRow = k * bytesPerPixel;
368 clayout.rowsPerImage = 1;
// Initial centroid values, taken from `centroids` (RGB) or `centroids_lab`
// (CIELAB) and scaled the same way as the image data.
370 std::vector<float> centroids_;
371 switch (color_space) {
372 case COLOR_SPACE_OPTION_RGB: {
373 for (
int i = 0; i < k; i++) {
374 auto p = centroids[i];
375 centroids_.push_back(p.red / 255.0f);
376 centroids_.push_back(p.green / 255.0f);
377 centroids_.push_back(p.blue / 255.0f);
378 centroids_.push_back(p.alpha / 255.0f);
382 case COLOR_SPACE_OPTION_CIELAB: {
383 for (
int i = 0; i < k; i++) {
384 auto p = centroids_lab[i];
385 centroids_.push_back(p.l / 255.0f);
386 centroids_.push_back(p.a / 255.0f);
387 centroids_.push_back(p.b / 255.0f);
388 centroids_.push_back(p.alpha / 255.0f);
394 GPU::getClassInstance().get_queue().WriteTexture(
395 &cdst, centroids_.data(), centroids_.size() *
sizeof(
float), &clayout, ¢roidDesc.size
// Label texture: RGBA32Uint, one texel per pixel; copied back to the CPU by
// the caller.
399 labelDesc.size = {
static_cast<uint32_t
>(width),
static_cast<uint32_t
>(height), 1};
400 labelDesc.format = wgpu::TextureFormat::RGBA32Uint;
401 labelDesc.usage = wgpu::TextureUsage::TextureBinding | wgpu::TextureUsage::StorageBinding |
402 wgpu::TextureUsage::CopyDst | wgpu::TextureUsage::CopySrc;
403 labelDesc.label =
"labelTexture";
404 labelTexture = GPU::getClassInstance().get_device().CreateTexture(&labelDesc);
// Uniform parameters: pixel count and centroid count.
407 Params params = {
static_cast<uint32_t
>(num_pixels),
static_cast<uint32_t
>(k)};
408 wgpu::BufferDescriptor bufDesc = {};
409 bufDesc.size =
sizeof(
Params);
410 bufDesc.usage = wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst;
411 wgpu::Buffer paramBuffer = GPU::getClassInstance().get_device().CreateBuffer(&bufDesc);
412 GPU::getClassInstance().get_queue().WriteBuffer(paramBuffer, 0, ¶ms,
sizeof(
Params));
// Accumulator buffer, one zeroed ClusterAccumulator per centroid. The line
// that sets accDesc.size is not part of this excerpt.
415 std::vector<ClusterAccumulator> reset_centroids(k, {0, 0, 0, 0});
416 wgpu::BufferDescriptor accDesc = {};
418 accDesc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopyDst;
419 wgpu::Buffer accBuffer = GPU::getClassInstance().get_device().CreateBuffer(&accDesc);
420 GPU::getClassInstance().get_queue().WriteBuffer(
421 accBuffer, 0, reset_centroids.data(), accDesc.size
// Compute pipelines. The assignment target of the first createPipeline call
// is on a line missing from this excerpt (presumably `pipeline1 =`; confirm).
426 GPU::getClassInstance().createPipeline(
"assign_update_shader",
"assignUpdateShader");
427 pipeline2 = GPU::getClassInstance().createPipeline(
"resolve_shader",
"resolveShader");
// Bind group 1: 0 = input texture, 1 = centroid texture, 2 = label texture,
// 3 = parameters, 4 = accumulators.
430 wgpu::BindGroupDescriptor bindGroupDesc1 = {};
431 bindGroupDesc1.layout = pipeline1.GetBindGroupLayout(0);
432 wgpu::BindGroupEntry entries1[5];
434 entries1[0].binding = 0;
435 entries1[0].textureView = inputTexture.CreateView();
437 entries1[1].binding = 1;
438 entries1[1].textureView = centroidTexture.CreateView();
440 entries1[2].binding = 2;
441 entries1[2].textureView = labelTexture.CreateView();
443 entries1[3].binding = 3;
444 entries1[3].buffer = paramBuffer;
445 entries1[3].size =
sizeof(
Params);
447 entries1[4].binding = 4;
448 entries1[4].buffer = accBuffer;
451 bindGroupDesc1.entryCount = 5;
452 bindGroupDesc1.entries = entries1;
453 bindGroup1 = GPU::getClassInstance().get_device().CreateBindGroup(&bindGroupDesc1);
// Bind group 2: 0 = accumulators, 1 = centroid texture.
455 wgpu::BindGroupDescriptor bindGroupDesc2 = {};
456 bindGroupDesc2.layout = pipeline2.GetBindGroupLayout(0);
457 wgpu::BindGroupEntry entries2[2];
458 entries2[0].binding = 0;
459 entries2[0].buffer = accBuffer;
460 entries2[0].size = accDesc.size;
461 entries2[1].binding = 1;
462 entries2[1].textureView = centroidTexture.CreateView();
463 bindGroupDesc2.entryCount = 2;
464 bindGroupDesc2.entries = entries2;
465 bindGroup2 = GPU::getClassInstance().get_device().CreateBindGroup(&bindGroupDesc2);
// GPU k-means driver. The function name and return type are not part of this
// excerpt.
//
// Parameters visible here:
//   data        input pixel buffer, loaded with ImageLib::RGBA_CONVERTER<float>
//   out_data    output pixel buffer; four bytes are written per pixel
//   out_labels  output buffer receiving one int32_t label per pixel
//   width, height  image dimensions
//   k           number of clusters
//   max_iter    number of assign/resolve pass pairs encoded
//   color_space COLOR_SPACE_OPTION_RGB or COLOR_SPACE_OPTION_CIELAB
//
// Flow: load pixels, optionally convert to CIELAB, seed centroids with
// kMeansPlusPlusInitGpu, create GPU resources, encode max_iter iterations into
// a single command encoder, read labels and centroids back, then write each
// pixel's centroid colour to out_data and its label to out_labels.
469 const uint8_t* data, uint8_t* out_data, int32_t* out_labels,
const int32_t width,
470 const int32_t height,
const int32_t k,
const int32_t max_iter,
const uint8_t color_space
473 pixels.loadFromBuffer(data, width, height, ImageLib::RGBA_CONVERTER<float>);
474 const int32_t num_pixels {pixels.getSize()};
// Labels start at -1 and are overwritten by the GPU readback below.
481 std::vector<int32_t> labels(num_pixels, -1);
// CIELAB mode: convert every pixel into the `lab` image first.
485 if (color_space == COLOR_SPACE_OPTION_CIELAB) {
486 for (
int i {0}; i < pixels.getSize(); ++i) {
487 rgb_to_lab<float, float>(pixels[i], lab[i]);
491 std::cout <<
"starting" << std::endl;
// Seed the centroids in the colour space selected by the caller.
494 switch (color_space) {
495 case COLOR_SPACE_OPTION_RGB: {
496 kMeansPlusPlusInitGpu<ImageLib::RGBAPixel<float>>(pixels, centroids, k, color_space);
499 case COLOR_SPACE_OPTION_CIELAB: {
500 kMeansPlusPlusInitGpu<ImageLib::LABAPixel<float>>(lab, centroids_lab, k, color_space);
504 std::cout <<
"kmeans++ init done" << std::endl;
// 16 bytes per texel; the label and centroid textures read back below are
// RGBA32Uint and RGBA32Float.
507 int bytesPerPixel {16};
// Handles and descriptors filled in by the setup call below.
512 wgpu::ComputePipeline pipeline1;
513 wgpu::ComputePipeline pipeline2;
514 wgpu::BindGroup bindGroup1;
515 wgpu::BindGroup bindGroup2;
516 wgpu::Texture inputTexture;
517 wgpu::Texture labelTexture;
518 wgpu::Texture centroidTexture;
519 wgpu::TextureDescriptor labelDesc = {};
520 wgpu::TextureDescriptor centroidDesc = {};
// Arguments of the setup call; the callee name is on a line missing from this
// excerpt.
524 pixels, lab, centroids, centroids_lab, width, height, k, inputTexture, labelTexture,
525 centroidTexture, labelDesc, centroidDesc, pipeline1, pipeline2, bindGroup1, bindGroup2,
// Workgroup grid for the per-pixel pass: ceil(width / 16) x ceil(height / 16).
529 uint32_t wgX = (width + 15) / 16;
530 uint32_t wgY = (height + 15) / 16;
// Readback buffer for labels: one aligned row per image row.
533 uint32_t bytesPerRowLabels =
534 GPU::getAlignedBytesPerRow(width,
static_cast<uint32_t
>(bytesPerPixel));
535 wgpu::BufferDescriptor readLabelsDesc = {};
536 readLabelsDesc.size = bytesPerRowLabels * height;
537 readLabelsDesc.usage = wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst;
538 wgpu::Buffer readLabelsBuffer =
539 GPU::getClassInstance().get_device().CreateBuffer(&readLabelsDesc);
// Readback buffer for centroids (a single row).
// NOTE(review): the row size is computed from `width`, but the centroid
// texture is k texels wide. This looks undersized whenever k > width;
// presumably `k` was intended here. Confirm.
542 uint32_t bytesPerRowCentroids =
543 GPU::getAlignedBytesPerRow(width,
static_cast<uint32_t
>(bytesPerPixel));
544 wgpu::BufferDescriptor readCentroidsDesc = {};
545 readCentroidsDesc.size = bytesPerRowCentroids;
546 readCentroidsDesc.usage = wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst;
547 wgpu::Buffer readCentroidsBuffer =
548 GPU::getClassInstance().get_device().CreateBuffer(&readCentroidsDesc);
// All iterations are recorded into one command encoder, created before the
// loop and submitted once after it.
551 std::cout <<
"start iterations" << std::endl;
552 wgpu::CommandEncoder encoder = GPU::getClassInstance().get_device().CreateCommandEncoder();
553 for (int32_t iter {0}; iter < max_iter; ++iter) {
// Pass 1: pipeline1 ("assign_update_shader") over the pixel grid.
554 wgpu::ComputePassEncoder pass1 = encoder.BeginComputePass();
555 pass1.SetPipeline(pipeline1);
556 pass1.SetBindGroup(0, bindGroup1);
557 pass1.DispatchWorkgroups(wgX, wgY);
// Pass 2: pipeline2 ("resolve_shader"), ceil(k / 256) workgroups.
560 wgpu::ComputePassEncoder pass2 = encoder.BeginComputePass();
561 pass2.SetPipeline(pipeline2);
562 pass2.SetBindGroup(0, bindGroup2);
563 pass2.DispatchWorkgroups((k + 255) / 256, 1);
// Copy the label texture into its readback buffer.
570 wgpu::TexelCopyTextureInfo srcLabels = {};
571 srcLabels.texture = labelTexture;
572 wgpu::TexelCopyBufferInfo dstLabels = {};
573 dstLabels.buffer = readLabelsBuffer;
574 dstLabels.layout.bytesPerRow = bytesPerRowLabels;
575 dstLabels.layout.rowsPerImage = height;
576 encoder.CopyTextureToBuffer(&srcLabels, &dstLabels, &labelDesc.size);
// Copy the centroid texture into its readback buffer.
// NOTE(review): `¢roidDesc` looks like a mis-decoded `&centroidDesc`.
579 wgpu::TexelCopyTextureInfo srcCentroids = {};
580 srcCentroids.texture = centroidTexture;
581 wgpu::TexelCopyBufferInfo dstCentroids = {};
582 dstCentroids.buffer = readCentroidsBuffer;
583 dstCentroids.layout.bytesPerRow = bytesPerRowCentroids;
584 dstCentroids.layout.rowsPerImage = 1;
585 encoder.CopyTextureToBuffer(&srcCentroids, &dstCentroids, ¢roidDesc.size);
587 wgpu::CommandBuffer commands = encoder.Finish();
588 GPU::getClassInstance().get_queue().Submit(1, &commands);
589 std::cout <<
"done iterations" << std::endl;
// Completion flags for the two asynchronous maps.
// NOTE(review): both are allocated with `new`; no matching `delete` is
// visible in this excerpt.
592 bool* done1 =
new bool(
false);
593 bool* done2 =
new bool(
false);
// Map the label readback buffer; the callback receives a flag via `userdata`.
596 readLabelsBuffer.MapAsync(
597 wgpu::MapMode::Read, 0, readLabelsDesc.size, wgpu::CallbackMode::AllowProcessEvents,
598 [](wgpu::MapAsyncStatus status, wgpu::StringView msg,
void* userdata) {
599 bool* flag = static_cast<bool*>(userdata);
600 bool success = false;
601 if (status == wgpu::MapAsyncStatus::Success) {
610 std::cout <<
"read out" << std::endl;
// Pump instance events (the surrounding wait loop is not part of this excerpt).
613 GPU::getClassInstance().get_instance().ProcessEvents();
614#if defined(__EMSCRIPTEN__)
615 emscripten_sleep(10);
619 std::cout <<
"mapping labels" << std::endl;
620 const uint8_t* mappedData = (
const uint8_t*)readLabelsBuffer.GetConstMappedRange();
// Rows in the mapped buffer are bytesPerRowLabels apart, so each row is
// addressed separately. The first 32-bit word of each 16-byte texel is the
// label.
// NOTE(review): size_t loop indices are compared against int32_t
// height / width (signed/unsigned mix).
623 for (
size_t y = 0; y < height; ++y) {
624 const uint8_t* rowPtr = mappedData + (y * bytesPerRowLabels);
625 for (
size_t x = 0; x < width; ++x) {
626 const uint8_t* pixelPtr = rowPtr + (x * bytesPerPixel);
628 std::memcpy(&r, pixelPtr,
sizeof(uint32_t));
630 size_t dstIndex = y * width + x;
631 labels[dstIndex] =
static_cast<int32_t
>(r);
635 readLabelsBuffer.Unmap();
// Same map-and-wait sequence for the centroid readback buffer.
638 readCentroidsBuffer.MapAsync(
639 wgpu::MapMode::Read, 0, readCentroidsDesc.size, wgpu::CallbackMode::AllowProcessEvents,
640 [](wgpu::MapAsyncStatus status, wgpu::StringView msg,
void* userdata) {
641 bool* flag =
static_cast<bool*
>(userdata);
642 bool success =
false;
643 if (status == wgpu::MapAsyncStatus::Success) {
653 GPU::getClassInstance().get_instance().ProcessEvents();
654#if defined(__EMSCRIPTEN__)
655 emscripten_sleep(10);
659 std::cout <<
"mapping centroids" << std::endl;
660 const float* mappedDataFloat = (
const float*)readCentroidsBuffer.GetConstMappedRange();
// Four consecutive floats per centroid. The statements that store them are on
// lines missing from this excerpt.
663 for (
int i = 0; i < k; i++) {
665 const float* centroidPtr = mappedDataFloat + (i * 4);
667 float r = *(centroidPtr);
668 float g = *(centroidPtr + 1);
669 float b = *(centroidPtr + 2);
670 float a = *(centroidPtr + 3);
671 switch (color_space) {
672 case COLOR_SPACE_OPTION_RGB: {
676 case COLOR_SPACE_OPTION_CIELAB: {
684 readCentroidsBuffer.Unmap();
// CIELAB mode: convert the final centroids to RGB before writing output.
687 if (color_space == COLOR_SPACE_OPTION_CIELAB) {
688 for (int32_t i {0}; i < k; ++i) {
689 lab_to_rgb<float, float>(centroids_lab[i], centroids[i]);
// Write each pixel as its centroid's colour with alpha fixed at 255.
// NOTE(review): `cluster` comes straight from the GPU readback and indexes
// `centroids` with no range check visible in this excerpt.
693 for (int32_t i = 0; i < num_pixels; ++i) {
694 const int32_t cluster = labels[i];
695 out_data[i * 4 + 0] =
static_cast<uint8_t
>(centroids[cluster].red);
696 out_data[i * 4 + 1] =
static_cast<uint8_t
>(centroids[cluster].green);
697 out_data[i * 4 + 2] =
static_cast<uint8_t
>(centroids[cluster].blue);
698 out_data[i * 4 + 3] = 255;
702 std::cout <<
"copying labels out" << std::endl;
703 std::memcpy(out_labels, labels.data(), labels.size() *
sizeof(int32_t));
// Explicitly release the GPU resources used by this function.
706 inputTexture.Destroy();
708 labelTexture.Destroy();
710 centroidTexture.Destroy();
711 readLabelsBuffer.Destroy();
712 readCentroidsBuffer.Destroy();
717 labels.shrink_to_fit();
718#if defined(__EMSCRIPTEN__)
719 emscripten_sleep(50);
Core image processing functions for the img2num project.