Skip to content

Commit c1d8e8d

Browse files
Optimize vector quantization step
This change improves the compression speed for both DXT and ETC encodings. Explanation: The vector quantization algorithm takes floating-point vectors as input and preprocesses them immediately before quantization. The selector training vectors, however, are generated directly from integer selector values that are packed into a single uint64. It is therefore more efficient to preprocess the selector training vectors (which includes sorting and deduplication) while they are still in packed form. An additional performance boost is achieved by sorting the training vectors on multiple threads. DXT Testing: The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). All decompressed test images are identical to the images obtained by compressing and decompressing with the original version of Crunch (revision ea9b8d8). [Compressing Kodak set without mipmaps using DXT1 encoding] Original: 1582222 bytes / 28.869 sec Modified: 1468204 bytes / 5.477 sec Improvement: 7.21% (compression ratio) / 81.03% (compression time) [Compressing Kodak set with mipmaps using DXT1 encoding] Original: 2065243 bytes / 36.961 sec Modified: 1914805 bytes / 7.322 sec Improvement: 7.28% (compression ratio) / 80.19% (compression time) ETC Testing: The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). The ETC1 quantization parameters have been selected so that ETC1 compression gives approximately the same average Luma PSNR as the corresponding DXT1 compression (which is equal to 34.044 dB for the Kodak test set compressed without mipmaps using DXT1 encoding and default quality settings). [Compressing Kodak set without mipmaps using ETC1 encoding] Total size: 1607858 bytes Total time: 12.766 sec Average bitrate: 1.363 bpp Average Luma PSNR: 34.050 dB
1 parent 21eb70b commit c1d8e8d

File tree

3 files changed

+249
-100
lines changed

3 files changed

+249
-100
lines changed

bin/crunch_x64.exe

9.5 KB
Binary file not shown.

crnlib/crn_dxt_hc.cpp

Lines changed: 223 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -530,7 +530,7 @@ void dxt_hc::determine_color_endpoint_codebook_task(uint64 data, void*) {
530530
uint b = blocks[i];
531531
uint weight = (uint)(math::clamp<uint>(endpoint_weight * m_block_weights[b], 1, 2048) * encoding_weight[m_block_encodings[b]]);
532532
uint32 selector = 0;
533-
for (uint sh = 0, p = 0; p < 16; p++, sh += 2) {
533+
for (uint p = 0; p < 16; p++) {
534534
uint error_best = cUINT32_MAX;
535535
uint8 s_best = 0;
536536
for (uint8 t = 0; t < 4; t++) {
@@ -541,9 +541,9 @@ void dxt_hc::determine_color_endpoint_codebook_task(uint64 data, void*) {
541541
error_best = error;
542542
}
543543
}
544-
selector |= s_best << sh;
544+
selector = selector << 2 | s_best;
545545
}
546-
m_block_selectors[cColor][b] = selector | (uint64)weight << 32;
546+
m_block_selectors[cColor][b] = (uint64)selector << 32 | weight;
547547
}
548548

549549
dxt_endpoint_refiner::params refinerParams;
@@ -609,7 +609,7 @@ void dxt_hc::determine_color_endpoint_codebook_task_etc(uint64 data, void*) {
609609
uint b = blocks[i];
610610
uint weight = (uint)(math::clamp<uint>(0x8000 * endpoint_weight * m_block_weights[b] * (m_block_encodings[b] ? 0.972f : 1.0f), 1, 0xFFFF));
611611
uint32 selector = 0;
612-
for (uint sh = 0, p = 0; p < 8; p++, sh += 2) {
612+
for (uint p = 0; p < 8; p++) {
613613
uint error_best = cUINT32_MAX;
614614
uint8 s_best = 0;
615615
for (uint8 s = 0; s < 4; s++) {
@@ -619,9 +619,9 @@ void dxt_hc::determine_color_endpoint_codebook_task_etc(uint64 data, void*) {
619619
error_best = error;
620620
}
621621
}
622-
selector |= s_best << sh;
622+
selector = selector << 2 | s_best;
623623
}
624-
m_block_selectors[cColor][b] = selector | (uint64)weight << 32;
624+
m_block_selectors[cColor][b] = (uint64)selector << ((b & 1) ? 32 : 48) | weight;
625625
}
626626
}
627627
}
@@ -663,13 +663,59 @@ void dxt_hc::determine_color_endpoint_clusters_task(uint64 data, void* pData_ptr
663663
}
664664

665665
void dxt_hc::determine_color_endpoints() {
666-
tree_clusterizer<vec6F> vq;
666+
uint num_tasks = m_pTask_pool->get_num_threads() + 1;
667+
crnlib::vector<std::pair<vec6F, uint> > endpoints;
667668
for (uint t = 0; t < m_tiles.size(); t++) {
668669
if (m_tiles[t].pixels.size())
669-
vq.add_training_vec(m_tiles[t].color_endpoint, (uint)(m_tiles[t].pixels.size() * m_tiles[t].weight));
670+
endpoints.push_back(std::make_pair(m_tiles[t].color_endpoint, (uint)(m_tiles[t].pixels.size() * m_tiles[t].weight)));
670671
}
671672

672-
vq.generate_codebook(math::minimum<uint>(m_num_tiles, m_params.m_color_endpoint_codebook_size), true, m_pTask_pool);
673+
struct Node {
674+
std::pair<vec6F, uint> *p, *pEnd;
675+
Node (std::pair<vec6F, uint>* begin, std::pair<vec6F, uint>* end) : p(begin), pEnd(end) {}
676+
bool operator<(const Node& other) const { return *p > *other.p; }
677+
static void sort_task(uint64 data, void* ptr) { std::sort(((Node*)ptr)->p, ((Node*)ptr)->pEnd); }
678+
};
679+
680+
crnlib::vector<Node> nodes;
681+
Node node(0, endpoints.get_ptr());
682+
for (uint i = 0; i < num_tasks; i++) {
683+
node.p = node.pEnd;
684+
node.pEnd = endpoints.get_ptr() + endpoints.size() * (i + 1) / num_tasks;
685+
if (node.p != node.pEnd)
686+
nodes.push_back(node);
687+
}
688+
689+
for (uint i = 0; i < nodes.size(); i++)
690+
m_pTask_pool->queue_task(&Node::sort_task, i, &nodes[i]);
691+
m_pTask_pool->join();
692+
693+
std::priority_queue<Node> queue;
694+
for (uint i = 0; i < nodes.size(); i++)
695+
queue.push(nodes[i]);
696+
697+
crnlib::vector<vec6F> vectors;
698+
crnlib::vector<uint> weights;
699+
vectors.reserve(endpoints.size());
700+
weights.reserve(endpoints.size());
701+
while (queue.size()) {
702+
Node node = queue.top();
703+
std::pair<vec6F, uint>* endpoint = node.p++;
704+
queue.pop();
705+
if (node.p != node.pEnd)
706+
queue.push(node);
707+
if (!vectors.size() || endpoint->first != vectors.back()) {
708+
vectors.push_back(endpoint->first);
709+
weights.push_back(endpoint->second);
710+
} else if (weights.back() > UINT_MAX - endpoint->second) {
711+
weights.back() = UINT_MAX;
712+
} else {
713+
weights.back() += endpoint->second;
714+
}
715+
}
716+
717+
tree_clusterizer<vec6F> vq;
718+
vq.generate_codebook(vectors.get_ptr(), weights.get_ptr(), vectors.size(), math::minimum<uint>(m_num_tiles, m_params.m_color_endpoint_codebook_size), true, m_pTask_pool);
673719
m_color_clusters.resize(vq.get_codebook_size());
674720

675721
for (uint i = 0; i <= m_pTask_pool->get_num_threads(); i++)
@@ -757,7 +803,7 @@ void dxt_hc::determine_alpha_endpoint_codebook_task(uint64 data, void*) {
757803
uint b = blocks[i];
758804
uint weight = encoding_weight[m_block_encodings[b]];
759805
uint64 selector = 0;
760-
for (uint sh = 0, p = 0; p < 16; p++, sh += 3) {
806+
for (uint p = 0; p < 16; p++) {
761807
uint error_best = cUINT32_MAX;
762808
uint8 s_best = 0;
763809
for (uint8 t = 0; t < 8; t++) {
@@ -769,9 +815,9 @@ void dxt_hc::determine_alpha_endpoint_codebook_task(uint64 data, void*) {
769815
error_best = error;
770816
}
771817
}
772-
selector |= (uint64)s_best << sh;
818+
selector = selector << 3 | s_best;
773819
}
774-
m_block_selectors[cAlpha0 + a][b] = selector | (uint64)weight << 48;
820+
m_block_selectors[cAlpha0 + a][b] = selector << 16 | weight;
775821
}
776822
}
777823

@@ -823,18 +869,64 @@ void dxt_hc::determine_alpha_endpoint_clusters_task(uint64 data, void* pData_ptr
823869
}
824870

825871
void dxt_hc::determine_alpha_endpoints() {
826-
tree_clusterizer<vec2F> vq;
872+
uint num_tasks = m_pTask_pool->get_num_threads() + 1;
873+
crnlib::vector<std::pair<vec2F, uint> > endpoints;
827874
for (uint a = 0; a < m_num_alpha_blocks; a++) {
828875
for (uint t = 0; t < m_tiles.size(); t++) {
829876
if (m_tiles[t].pixels.size())
830-
vq.add_training_vec(m_tiles[t].alpha_endpoints[a], m_tiles[t].pixels.size());
877+
endpoints.push_back(std::make_pair(m_tiles[t].alpha_endpoints[a], m_tiles[t].pixels.size()));
831878
}
832879
}
833880

834-
vq.generate_codebook(math::minimum<uint>(m_num_tiles, m_params.m_alpha_endpoint_codebook_size), false, m_pTask_pool);
881+
struct Node {
882+
std::pair<vec2F, uint> *p, *pEnd;
883+
Node (std::pair<vec2F, uint>* begin, std::pair<vec2F, uint>* end) : p(begin), pEnd(end) {}
884+
bool operator<(const Node& other) const { return *p > *other.p; }
885+
static void sort_task(uint64 data, void* ptr) { std::sort(((Node*)ptr)->p, ((Node*)ptr)->pEnd); }
886+
};
887+
888+
crnlib::vector<Node> nodes;
889+
Node node(0, endpoints.get_ptr());
890+
for (uint i = 0; i < num_tasks; i++) {
891+
node.p = node.pEnd;
892+
node.pEnd = endpoints.get_ptr() + endpoints.size() * (i + 1) / num_tasks;
893+
if (node.p != node.pEnd)
894+
nodes.push_back(node);
895+
}
896+
897+
for (uint i = 0; i < nodes.size(); i++)
898+
m_pTask_pool->queue_task(&Node::sort_task, i, &nodes[i]);
899+
m_pTask_pool->join();
900+
901+
std::priority_queue<Node> queue;
902+
for (uint i = 0; i < nodes.size(); i++)
903+
queue.push(nodes[i]);
904+
905+
crnlib::vector<vec2F> vectors;
906+
crnlib::vector<uint> weights;
907+
vectors.reserve(endpoints.size());
908+
weights.reserve(endpoints.size());
909+
while (queue.size()) {
910+
Node node = queue.top();
911+
std::pair<vec2F, uint>* endpoint = node.p++;
912+
queue.pop();
913+
if (node.p != node.pEnd)
914+
queue.push(node);
915+
if (!vectors.size() || endpoint->first != vectors.back()) {
916+
vectors.push_back(endpoint->first);
917+
weights.push_back(endpoint->second);
918+
} else if (weights.back() > UINT_MAX - endpoint->second) {
919+
weights.back() = UINT_MAX;
920+
} else {
921+
weights.back() += endpoint->second;
922+
}
923+
}
924+
925+
tree_clusterizer<vec2F> vq;
926+
vq.generate_codebook(vectors.get_ptr(), weights.get_ptr(), vectors.size(), math::minimum<uint>(m_num_tiles, m_params.m_alpha_endpoint_codebook_size), false, m_pTask_pool);
835927
m_alpha_clusters.resize(vq.get_codebook_size());
836928

837-
for (uint i = 0; i <= m_pTask_pool->get_num_threads(); i++)
929+
for (uint i = 0; i < num_tasks; i++)
838930
m_pTask_pool->queue_object_task(this, &dxt_hc::determine_alpha_endpoint_clusters_task, i, &vq);
839931
m_pTask_pool->join();
840932

@@ -859,7 +951,7 @@ void dxt_hc::determine_alpha_endpoints() {
859951
}
860952
}
861953

862-
for (uint i = 0; i <= m_pTask_pool->get_num_threads(); i++)
954+
for (uint i = 0; i < num_tasks; i++)
863955
m_pTask_pool->queue_object_task(this, &dxt_hc::determine_alpha_endpoint_codebook_task, i, NULL);
864956
m_pTask_pool->join();
865957
}
@@ -911,16 +1003,68 @@ void dxt_hc::create_color_selector_codebook_task(uint64 data, void* pData_ptr) {
9111003
}
9121004
}
9131005

1006+
struct SelectorNode {
1007+
uint64 *p, *pEnd;
1008+
SelectorNode (uint64* begin, uint64* end) : p(begin), pEnd(end) {}
1009+
bool operator<(const SelectorNode& other) const { return *p > *other.p; }
1010+
static void sort_task(uint64 data, void* ptr) { std::sort(((SelectorNode*)ptr)->p, ((SelectorNode*)ptr)->pEnd); }
1011+
};
1012+
9141013
void dxt_hc::create_color_selector_codebook() {
915-
tree_clusterizer<vec16F> selector_vq;
916-
vec16F v;
917-
for (uint n = m_has_etc_color_blocks ? m_num_blocks >> 1 : m_num_blocks, b = 0; b < n; b++) {
918-
uint64 selector = m_has_etc_color_blocks ? m_block_selectors[cColor][b << 1] | m_block_selectors[cColor][b << 1 | 1] << 16 : m_block_selectors[cColor][b];
919-
for (uint8 p = 0; p < 16; p++, selector >>= 2)
920-
v[p] = ((selector & 3) + 0.5f) * 0.25f;
921-
selector_vq.add_training_vec(v, m_has_etc_color_blocks ? (selector & 0xFFFF) + (selector >> 16) : selector);
1014+
uint num_tasks = m_pTask_pool->get_num_threads() + 1;
1015+
crnlib::vector<uint64> selectors(m_has_etc_color_blocks ? m_num_blocks >> 1 : m_num_blocks);
1016+
for (uint i = 0, b = 0, step = m_has_etc_color_blocks ? 2 : 1; b < m_num_blocks; b += step)
1017+
selectors[i++] = m_block_selectors[cColor][b] + (m_has_etc_color_blocks ? m_block_selectors[cColor][b + 1] : 0);
1018+
1019+
crnlib::vector<SelectorNode> nodes;
1020+
SelectorNode node(0, selectors.get_ptr());
1021+
for (uint i = 0; i < num_tasks; i++) {
1022+
node.p = node.pEnd;
1023+
node.pEnd = selectors.get_ptr() + selectors.size() * (i + 1) / num_tasks;
1024+
if (node.p != node.pEnd)
1025+
nodes.push_back(node);
9221026
}
923-
selector_vq.generate_codebook(m_params.m_color_selector_codebook_size, false, m_pTask_pool);
1027+
1028+
for (uint i = 0; i < nodes.size(); i++)
1029+
m_pTask_pool->queue_task(&SelectorNode::sort_task, i, &nodes[i]);
1030+
m_pTask_pool->join();
1031+
1032+
std::priority_queue<SelectorNode> queue;
1033+
for (uint i = 0; i < nodes.size(); i++)
1034+
queue.push(nodes[i]);
1035+
1036+
float v[4];
1037+
for (uint s = 0; s < 4; s++)
1038+
v[s] = (s + 0.5f) * 0.25f;
1039+
1040+
crnlib::vector<vec16F> vectors;
1041+
crnlib::vector<uint> weights;
1042+
vectors.reserve(selectors.size());
1043+
weights.reserve(selectors.size());
1044+
for (uint64 prev_selector = 0; queue.size();) {
1045+
SelectorNode node = queue.top();
1046+
uint64 selector = *node.p++;
1047+
queue.pop();
1048+
if (node.p != node.pEnd)
1049+
queue.push(node);
1050+
uint weight = (uint)selector;
1051+
selector >>= 32;
1052+
if (!vectors.size() || selector != prev_selector) {
1053+
prev_selector = selector;
1054+
vec16F vector;
1055+
for (uint p = 0; p < 16; p++, selector >>= 2)
1056+
vector[15 - p] = v[selector & 3];
1057+
vectors.push_back(vector);
1058+
weights.push_back(weight);
1059+
} else if (weights.back() > UINT_MAX - weight) {
1060+
weights.back() = UINT_MAX;
1061+
} else {
1062+
weights.back() += weight;
1063+
}
1064+
}
1065+
1066+
tree_clusterizer<vec16F> selector_vq;
1067+
selector_vq.generate_codebook(vectors.get_ptr(), weights.get_ptr(), vectors.size(), m_params.m_color_selector_codebook_size, false, m_pTask_pool);
9241068
m_color_selectors.resize(selector_vq.get_codebook_size());
9251069
m_color_selectors_used.resize(selector_vq.get_codebook_size());
9261070
for (uint i = 0; i < selector_vq.get_codebook_size(); i++) {
@@ -930,7 +1074,6 @@ void dxt_hc::create_color_selector_codebook() {
9301074
m_color_selectors[i] |= (uint)(v[j] * 4.0f) << sh;
9311075
}
9321076

933-
uint num_tasks = m_pTask_pool->get_num_threads() + 1;
9341077
crnlib::vector<crnlib::vector<color_selector_details> > selector_details(num_tasks);
9351078
for (uint t = 0; t < num_tasks; t++) {
9361079
selector_details[t].resize(m_color_selectors.size());
@@ -1024,17 +1167,62 @@ void dxt_hc::create_alpha_selector_codebook_task(uint64 data, void* pData_ptr) {
10241167
}
10251168

10261169
void dxt_hc::create_alpha_selector_codebook() {
1027-
tree_clusterizer<vec16F> selector_vq;
1028-
vec16F v;
1029-
for (uint c = cAlpha0; c < cAlpha0 + m_num_alpha_blocks; c++) {
1030-
for (uint b = 0; b < m_num_blocks; b += m_has_etc_color_blocks ? 2 : 1) {
1031-
uint64 selector = m_block_selectors[c][b];
1032-
for (uint8 p = 0; p < 16; p++, selector >>= 3)
1033-
v[p] = ((selector & 7) + 0.5f) * 0.125f;
1034-
selector_vq.add_training_vec(v, selector);
1170+
uint num_tasks = m_pTask_pool->get_num_threads() + 1;
1171+
crnlib::vector<uint64> selectors(m_num_alpha_blocks * (m_has_etc_color_blocks ? m_num_blocks >> 1 : m_num_blocks));
1172+
for (uint i = 0, c = cAlpha0; c < cAlpha0 + m_num_alpha_blocks; c++) {
1173+
for (uint b = 0, step = m_has_etc_color_blocks ? 2 : 1; b < m_num_blocks; b += step)
1174+
selectors[i++] = m_block_selectors[c][b];
1175+
}
1176+
1177+
crnlib::vector<SelectorNode> nodes;
1178+
SelectorNode node(0, selectors.get_ptr());
1179+
for (uint i = 0; i < num_tasks; i++) {
1180+
node.p = node.pEnd;
1181+
node.pEnd = selectors.get_ptr() + selectors.size() * (i + 1) / num_tasks;
1182+
if (node.p != node.pEnd)
1183+
nodes.push_back(node);
1184+
}
1185+
1186+
for (uint i = 0; i < nodes.size(); i++)
1187+
m_pTask_pool->queue_task(&SelectorNode::sort_task, i, &nodes[i]);
1188+
m_pTask_pool->join();
1189+
1190+
std::priority_queue<SelectorNode> queue;
1191+
for (uint i = 0; i < nodes.size(); i++)
1192+
queue.push(nodes[i]);
1193+
1194+
float v[8];
1195+
for (uint s = 0; s < 8; s++)
1196+
v[s] = (s + 0.5f) * 0.125f;
1197+
1198+
crnlib::vector<vec16F> vectors;
1199+
crnlib::vector<uint> weights;
1200+
vectors.reserve(selectors.size());
1201+
weights.reserve(selectors.size());
1202+
for (uint64 prev_selector = 0; queue.size();) {
1203+
SelectorNode node = queue.top();
1204+
uint64 selector = *node.p++;
1205+
queue.pop();
1206+
if (node.p != node.pEnd)
1207+
queue.push(node);
1208+
uint weight = (uint16)selector;
1209+
selector >>= 16;
1210+
if (!vectors.size() || selector != prev_selector) {
1211+
prev_selector = selector;
1212+
vec16F vector;
1213+
for (uint p = 0; p < 16; p++, selector >>= 3)
1214+
vector[15 - p] = v[selector & 7];
1215+
vectors.push_back(vector);
1216+
weights.push_back(weight);
1217+
} else if (weights.back() > UINT_MAX - weight) {
1218+
weights.back() = UINT_MAX;
1219+
} else {
1220+
weights.back() += weight;
10351221
}
10361222
}
1037-
selector_vq.generate_codebook(m_params.m_alpha_selector_codebook_size, false, m_pTask_pool);
1223+
1224+
tree_clusterizer<vec16F> selector_vq;
1225+
selector_vq.generate_codebook(vectors.get_ptr(), weights.get_ptr(), vectors.size(), m_params.m_alpha_selector_codebook_size, false, m_pTask_pool);
10381226
m_alpha_selectors.resize(selector_vq.get_codebook_size());
10391227
m_alpha_selectors_used.resize(selector_vq.get_codebook_size());
10401228
for (uint i = 0; i < selector_vq.get_codebook_size(); i++) {
@@ -1044,7 +1232,6 @@ void dxt_hc::create_alpha_selector_codebook() {
10441232
m_alpha_selectors[i] |= (uint64)(v[j] * 8.0f) << sh;
10451233
}
10461234

1047-
uint num_tasks = m_pTask_pool->get_num_threads() + 1;
10481235
crnlib::vector<crnlib::vector<alpha_selector_details> > selector_details(num_tasks);
10491236
for (uint t = 0; t < num_tasks; t++) {
10501237
selector_details[t].resize(m_alpha_selectors.size());

0 commit comments

Comments
 (0)