@@ -530,7 +530,7 @@ void dxt_hc::determine_color_endpoint_codebook_task(uint64 data, void*) {
530
530
uint b = blocks[i];
531
531
uint weight = (uint)(math::clamp<uint>(endpoint_weight * m_block_weights[b], 1 , 2048 ) * encoding_weight[m_block_encodings[b]]);
532
532
uint32 selector = 0 ;
533
- for (uint sh = 0 , p = 0 ; p < 16 ; p++, sh += 2 ) {
533
+ for (uint p = 0 ; p < 16 ; p++) {
534
534
uint error_best = cUINT32_MAX;
535
535
uint8 s_best = 0 ;
536
536
for (uint8 t = 0 ; t < 4 ; t++) {
@@ -541,9 +541,9 @@ void dxt_hc::determine_color_endpoint_codebook_task(uint64 data, void*) {
541
541
error_best = error;
542
542
}
543
543
}
544
- selector |= s_best << sh ;
544
+ selector = selector << 2 | s_best ;
545
545
}
546
- m_block_selectors[cColor][b] = selector | (uint64)weight << 32 ;
546
+ m_block_selectors[cColor][b] = (uint64)selector << 32 | weight ;
547
547
}
548
548
549
549
dxt_endpoint_refiner::params refinerParams;
@@ -609,7 +609,7 @@ void dxt_hc::determine_color_endpoint_codebook_task_etc(uint64 data, void*) {
609
609
uint b = blocks[i];
610
610
uint weight = (uint)(math::clamp<uint>(0x8000 * endpoint_weight * m_block_weights[b] * (m_block_encodings[b] ? 0 .972f : 1 .0f ), 1 , 0xFFFF ));
611
611
uint32 selector = 0 ;
612
- for (uint sh = 0 , p = 0 ; p < 8 ; p++, sh += 2 ) {
612
+ for (uint p = 0 ; p < 8 ; p++) {
613
613
uint error_best = cUINT32_MAX;
614
614
uint8 s_best = 0 ;
615
615
for (uint8 s = 0 ; s < 4 ; s++) {
@@ -619,9 +619,9 @@ void dxt_hc::determine_color_endpoint_codebook_task_etc(uint64 data, void*) {
619
619
error_best = error;
620
620
}
621
621
}
622
- selector |= s_best << sh ;
622
+ selector = selector << 2 | s_best ;
623
623
}
624
- m_block_selectors[cColor][b] = selector | (uint64)weight << 32 ;
624
+ m_block_selectors[cColor][b] = (uint64)selector << ((b & 1 ) ? 32 : 48 ) | weight ;
625
625
}
626
626
}
627
627
}
@@ -663,13 +663,59 @@ void dxt_hc::determine_color_endpoint_clusters_task(uint64 data, void* pData_ptr
663
663
}
664
664
665
665
void dxt_hc::determine_color_endpoints () {
666
- tree_clusterizer<vec6F> vq;
666
+ uint num_tasks = m_pTask_pool->get_num_threads () + 1 ;
667
+ crnlib::vector<std::pair<vec6F, uint> > endpoints;
667
668
for (uint t = 0 ; t < m_tiles.size (); t++) {
668
669
if (m_tiles[t].pixels .size ())
669
- vq. add_training_vec ( m_tiles[t].color_endpoint , (uint)(m_tiles[t].pixels .size () * m_tiles[t].weight ));
670
+ endpoints. push_back ( std::make_pair ( m_tiles[t].color_endpoint , (uint)(m_tiles[t].pixels .size () * m_tiles[t].weight ) ));
670
671
}
671
672
672
- vq.generate_codebook (math::minimum<uint>(m_num_tiles, m_params.m_color_endpoint_codebook_size ), true , m_pTask_pool);
673
+ struct Node {
674
+ std::pair<vec6F, uint> *p, *pEnd;
675
+ Node (std::pair<vec6F, uint>* begin, std::pair<vec6F, uint>* end) : p(begin), pEnd(end) {}
676
+ bool operator <(const Node& other) const { return *p > *other.p ; }
677
+ static void sort_task (uint64 data, void * ptr) { std::sort (((Node*)ptr)->p , ((Node*)ptr)->pEnd ); }
678
+ };
679
+
680
+ crnlib::vector<Node> nodes;
681
+ Node node (0 , endpoints.get_ptr ());
682
+ for (uint i = 0 ; i < num_tasks; i++) {
683
+ node.p = node.pEnd ;
684
+ node.pEnd = endpoints.get_ptr () + endpoints.size () * (i + 1 ) / num_tasks;
685
+ if (node.p != node.pEnd )
686
+ nodes.push_back (node);
687
+ }
688
+
689
+ for (uint i = 0 ; i < nodes.size (); i++)
690
+ m_pTask_pool->queue_task (&Node::sort_task, i, &nodes[i]);
691
+ m_pTask_pool->join ();
692
+
693
+ std::priority_queue<Node> queue;
694
+ for (uint i = 0 ; i < nodes.size (); i++)
695
+ queue.push (nodes[i]);
696
+
697
+ crnlib::vector<vec6F> vectors;
698
+ crnlib::vector<uint> weights;
699
+ vectors.reserve (endpoints.size ());
700
+ weights.reserve (endpoints.size ());
701
+ while (queue.size ()) {
702
+ Node node = queue.top ();
703
+ std::pair<vec6F, uint>* endpoint = node.p ++;
704
+ queue.pop ();
705
+ if (node.p != node.pEnd )
706
+ queue.push (node);
707
+ if (!vectors.size () || endpoint->first != vectors.back ()) {
708
+ vectors.push_back (endpoint->first );
709
+ weights.push_back (endpoint->second );
710
+ } else if (weights.back () > UINT_MAX - endpoint->second ) {
711
+ weights.back () = UINT_MAX;
712
+ } else {
713
+ weights.back () += endpoint->second ;
714
+ }
715
+ }
716
+
717
+ tree_clusterizer<vec6F> vq;
718
+ vq.generate_codebook (vectors.get_ptr (), weights.get_ptr (), vectors.size (), math::minimum<uint>(m_num_tiles, m_params.m_color_endpoint_codebook_size ), true , m_pTask_pool);
673
719
m_color_clusters.resize (vq.get_codebook_size ());
674
720
675
721
for (uint i = 0 ; i <= m_pTask_pool->get_num_threads (); i++)
@@ -757,7 +803,7 @@ void dxt_hc::determine_alpha_endpoint_codebook_task(uint64 data, void*) {
757
803
uint b = blocks[i];
758
804
uint weight = encoding_weight[m_block_encodings[b]];
759
805
uint64 selector = 0 ;
760
- for (uint sh = 0 , p = 0 ; p < 16 ; p++, sh += 3 ) {
806
+ for (uint p = 0 ; p < 16 ; p++) {
761
807
uint error_best = cUINT32_MAX;
762
808
uint8 s_best = 0 ;
763
809
for (uint8 t = 0 ; t < 8 ; t++) {
@@ -769,9 +815,9 @@ void dxt_hc::determine_alpha_endpoint_codebook_task(uint64 data, void*) {
769
815
error_best = error;
770
816
}
771
817
}
772
- selector |= (uint64)s_best << sh ;
818
+ selector = selector << 3 | s_best ;
773
819
}
774
- m_block_selectors[cAlpha0 + a][b] = selector | (uint64)weight << 48 ;
820
+ m_block_selectors[cAlpha0 + a][b] = selector << 16 | weight ;
775
821
}
776
822
}
777
823
@@ -823,18 +869,64 @@ void dxt_hc::determine_alpha_endpoint_clusters_task(uint64 data, void* pData_ptr
823
869
}
824
870
825
871
void dxt_hc::determine_alpha_endpoints () {
826
- tree_clusterizer<vec2F> vq;
872
+ uint num_tasks = m_pTask_pool->get_num_threads () + 1 ;
873
+ crnlib::vector<std::pair<vec2F, uint> > endpoints;
827
874
for (uint a = 0 ; a < m_num_alpha_blocks; a++) {
828
875
for (uint t = 0 ; t < m_tiles.size (); t++) {
829
876
if (m_tiles[t].pixels .size ())
830
- vq. add_training_vec ( m_tiles[t].alpha_endpoints [a], m_tiles[t].pixels .size ());
877
+ endpoints. push_back ( std::make_pair ( m_tiles[t].alpha_endpoints [a], m_tiles[t].pixels .size () ));
831
878
}
832
879
}
833
880
834
- vq.generate_codebook (math::minimum<uint>(m_num_tiles, m_params.m_alpha_endpoint_codebook_size ), false , m_pTask_pool);
881
+ struct Node {
882
+ std::pair<vec2F, uint> *p, *pEnd;
883
+ Node (std::pair<vec2F, uint>* begin, std::pair<vec2F, uint>* end) : p(begin), pEnd(end) {}
884
+ bool operator <(const Node& other) const { return *p > *other.p ; }
885
+ static void sort_task (uint64 data, void * ptr) { std::sort (((Node*)ptr)->p , ((Node*)ptr)->pEnd ); }
886
+ };
887
+
888
+ crnlib::vector<Node> nodes;
889
+ Node node (0 , endpoints.get_ptr ());
890
+ for (uint i = 0 ; i < num_tasks; i++) {
891
+ node.p = node.pEnd ;
892
+ node.pEnd = endpoints.get_ptr () + endpoints.size () * (i + 1 ) / num_tasks;
893
+ if (node.p != node.pEnd )
894
+ nodes.push_back (node);
895
+ }
896
+
897
+ for (uint i = 0 ; i < nodes.size (); i++)
898
+ m_pTask_pool->queue_task (&Node::sort_task, i, &nodes[i]);
899
+ m_pTask_pool->join ();
900
+
901
+ std::priority_queue<Node> queue;
902
+ for (uint i = 0 ; i < nodes.size (); i++)
903
+ queue.push (nodes[i]);
904
+
905
+ crnlib::vector<vec2F> vectors;
906
+ crnlib::vector<uint> weights;
907
+ vectors.reserve (endpoints.size ());
908
+ weights.reserve (endpoints.size ());
909
+ while (queue.size ()) {
910
+ Node node = queue.top ();
911
+ std::pair<vec2F, uint>* endpoint = node.p ++;
912
+ queue.pop ();
913
+ if (node.p != node.pEnd )
914
+ queue.push (node);
915
+ if (!vectors.size () || endpoint->first != vectors.back ()) {
916
+ vectors.push_back (endpoint->first );
917
+ weights.push_back (endpoint->second );
918
+ } else if (weights.back () > UINT_MAX - endpoint->second ) {
919
+ weights.back () = UINT_MAX;
920
+ } else {
921
+ weights.back () += endpoint->second ;
922
+ }
923
+ }
924
+
925
+ tree_clusterizer<vec2F> vq;
926
+ vq.generate_codebook (vectors.get_ptr (), weights.get_ptr (), vectors.size (), math::minimum<uint>(m_num_tiles, m_params.m_alpha_endpoint_codebook_size ), false , m_pTask_pool);
835
927
m_alpha_clusters.resize (vq.get_codebook_size ());
836
928
837
- for (uint i = 0 ; i <= m_pTask_pool-> get_num_threads () ; i++)
929
+ for (uint i = 0 ; i < num_tasks ; i++)
838
930
m_pTask_pool->queue_object_task (this , &dxt_hc::determine_alpha_endpoint_clusters_task, i, &vq);
839
931
m_pTask_pool->join ();
840
932
@@ -859,7 +951,7 @@ void dxt_hc::determine_alpha_endpoints() {
859
951
}
860
952
}
861
953
862
- for (uint i = 0 ; i <= m_pTask_pool-> get_num_threads () ; i++)
954
+ for (uint i = 0 ; i < num_tasks ; i++)
863
955
m_pTask_pool->queue_object_task (this , &dxt_hc::determine_alpha_endpoint_codebook_task, i, NULL );
864
956
m_pTask_pool->join ();
865
957
}
@@ -911,16 +1003,68 @@ void dxt_hc::create_color_selector_codebook_task(uint64 data, void* pData_ptr) {
911
1003
}
912
1004
}
913
1005
1006
+ struct SelectorNode { // Cursor over a half-open range [p, pEnd) of packed uint64 selector values; one node per chunk, used as the element type of a std::priority_queue when sorted chunks are merged.
1007
+ uint64 *p, *pEnd; // p: current element of the chunk (advanced by the merge loop); pEnd: one past the chunk's last element.
1008
+ SelectorNode (uint64* begin, uint64* end) : p(begin), pEnd(end) {}
1009
+ bool operator <(const SelectorNode& other) const { return *p > *other.p ; } // Deliberately inverted: std::priority_queue is a max-heap, so this makes top() the node whose current value is smallest. Dereferences p, so both nodes must be non-empty (p != pEnd).
1010
+ static void sort_task (uint64 data, void * ptr) { std::sort (((SelectorNode*)ptr)->p , ((SelectorNode*)ptr)->pEnd ); } // Task-pool callback: ptr is the SelectorNode whose range is sorted ascending in place; data is unused.
1011
+ };
1012
+
914
1013
void dxt_hc::create_color_selector_codebook () {
915
- tree_clusterizer<vec16F> selector_vq;
916
- vec16F v;
917
- for (uint n = m_has_etc_color_blocks ? m_num_blocks >> 1 : m_num_blocks, b = 0 ; b < n; b++) {
918
- uint64 selector = m_has_etc_color_blocks ? m_block_selectors[cColor][b << 1 ] | m_block_selectors[cColor][b << 1 | 1 ] << 16 : m_block_selectors[cColor][b];
919
- for (uint8 p = 0 ; p < 16 ; p++, selector >>= 2 )
920
- v[p] = ((selector & 3 ) + 0 .5f ) * 0 .25f ;
921
- selector_vq.add_training_vec (v, m_has_etc_color_blocks ? (selector & 0xFFFF ) + (selector >> 16 ) : selector);
1014
+ uint num_tasks = m_pTask_pool->get_num_threads () + 1 ;
1015
+ crnlib::vector<uint64> selectors (m_has_etc_color_blocks ? m_num_blocks >> 1 : m_num_blocks);
1016
+ for (uint i = 0 , b = 0 , step = m_has_etc_color_blocks ? 2 : 1 ; b < m_num_blocks; b += step)
1017
+ selectors[i++] = m_block_selectors[cColor][b] + (m_has_etc_color_blocks ? m_block_selectors[cColor][b + 1 ] : 0 );
1018
+
1019
+ crnlib::vector<SelectorNode> nodes;
1020
+ SelectorNode node (0 , selectors.get_ptr ());
1021
+ for (uint i = 0 ; i < num_tasks; i++) {
1022
+ node.p = node.pEnd ;
1023
+ node.pEnd = selectors.get_ptr () + selectors.size () * (i + 1 ) / num_tasks;
1024
+ if (node.p != node.pEnd )
1025
+ nodes.push_back (node);
922
1026
}
923
- selector_vq.generate_codebook (m_params.m_color_selector_codebook_size , false , m_pTask_pool);
1027
+
1028
+ for (uint i = 0 ; i < nodes.size (); i++)
1029
+ m_pTask_pool->queue_task (&SelectorNode::sort_task, i, &nodes[i]);
1030
+ m_pTask_pool->join ();
1031
+
1032
+ std::priority_queue<SelectorNode> queue;
1033
+ for (uint i = 0 ; i < nodes.size (); i++)
1034
+ queue.push (nodes[i]);
1035
+
1036
+ float v[4 ];
1037
+ for (uint s = 0 ; s < 4 ; s++)
1038
+ v[s] = (s + 0 .5f ) * 0 .25f ;
1039
+
1040
+ crnlib::vector<vec16F> vectors;
1041
+ crnlib::vector<uint> weights;
1042
+ vectors.reserve (selectors.size ());
1043
+ weights.reserve (selectors.size ());
1044
+ for (uint64 prev_selector = 0 ; queue.size ();) {
1045
+ SelectorNode node = queue.top ();
1046
+ uint64 selector = *node.p ++;
1047
+ queue.pop ();
1048
+ if (node.p != node.pEnd )
1049
+ queue.push (node);
1050
+ uint weight = (uint)selector;
1051
+ selector >>= 32 ;
1052
+ if (!vectors.size () || selector != prev_selector) {
1053
+ prev_selector = selector;
1054
+ vec16F vector;
1055
+ for (uint p = 0 ; p < 16 ; p++, selector >>= 2 )
1056
+ vector[15 - p] = v[selector & 3 ];
1057
+ vectors.push_back (vector);
1058
+ weights.push_back (weight);
1059
+ } else if (weights.back () > UINT_MAX - weight) {
1060
+ weights.back () = UINT_MAX;
1061
+ } else {
1062
+ weights.back () += weight;
1063
+ }
1064
+ }
1065
+
1066
+ tree_clusterizer<vec16F> selector_vq;
1067
+ selector_vq.generate_codebook (vectors.get_ptr (), weights.get_ptr (), vectors.size (), m_params.m_color_selector_codebook_size , false , m_pTask_pool);
924
1068
m_color_selectors.resize (selector_vq.get_codebook_size ());
925
1069
m_color_selectors_used.resize (selector_vq.get_codebook_size ());
926
1070
for (uint i = 0 ; i < selector_vq.get_codebook_size (); i++) {
@@ -930,7 +1074,6 @@ void dxt_hc::create_color_selector_codebook() {
930
1074
m_color_selectors[i] |= (uint)(v[j] * 4 .0f ) << sh;
931
1075
}
932
1076
933
- uint num_tasks = m_pTask_pool->get_num_threads () + 1 ;
934
1077
crnlib::vector<crnlib::vector<color_selector_details> > selector_details (num_tasks);
935
1078
for (uint t = 0 ; t < num_tasks; t++) {
936
1079
selector_details[t].resize (m_color_selectors.size ());
@@ -1024,17 +1167,62 @@ void dxt_hc::create_alpha_selector_codebook_task(uint64 data, void* pData_ptr) {
1024
1167
}
1025
1168
1026
1169
void dxt_hc::create_alpha_selector_codebook () {
1027
- tree_clusterizer<vec16F> selector_vq;
1028
- vec16F v;
1029
- for (uint c = cAlpha0; c < cAlpha0 + m_num_alpha_blocks; c++) {
1030
- for (uint b = 0 ; b < m_num_blocks; b += m_has_etc_color_blocks ? 2 : 1 ) {
1031
- uint64 selector = m_block_selectors[c][b];
1032
- for (uint8 p = 0 ; p < 16 ; p++, selector >>= 3 )
1033
- v[p] = ((selector & 7 ) + 0 .5f ) * 0 .125f ;
1034
- selector_vq.add_training_vec (v, selector);
1170
+ uint num_tasks = m_pTask_pool->get_num_threads () + 1 ;
1171
+ crnlib::vector<uint64> selectors (m_num_alpha_blocks * (m_has_etc_color_blocks ? m_num_blocks >> 1 : m_num_blocks));
1172
+ for (uint i = 0 , c = cAlpha0; c < cAlpha0 + m_num_alpha_blocks; c++) {
1173
+ for (uint b = 0 , step = m_has_etc_color_blocks ? 2 : 1 ; b < m_num_blocks; b += step)
1174
+ selectors[i++] = m_block_selectors[c][b];
1175
+ }
1176
+
1177
+ crnlib::vector<SelectorNode> nodes;
1178
+ SelectorNode node (0 , selectors.get_ptr ());
1179
+ for (uint i = 0 ; i < num_tasks; i++) {
1180
+ node.p = node.pEnd ;
1181
+ node.pEnd = selectors.get_ptr () + selectors.size () * (i + 1 ) / num_tasks;
1182
+ if (node.p != node.pEnd )
1183
+ nodes.push_back (node);
1184
+ }
1185
+
1186
+ for (uint i = 0 ; i < nodes.size (); i++)
1187
+ m_pTask_pool->queue_task (&SelectorNode::sort_task, i, &nodes[i]);
1188
+ m_pTask_pool->join ();
1189
+
1190
+ std::priority_queue<SelectorNode> queue;
1191
+ for (uint i = 0 ; i < nodes.size (); i++)
1192
+ queue.push (nodes[i]);
1193
+
1194
+ float v[8 ];
1195
+ for (uint s = 0 ; s < 8 ; s++)
1196
+ v[s] = (s + 0 .5f ) * 0 .125f ;
1197
+
1198
+ crnlib::vector<vec16F> vectors;
1199
+ crnlib::vector<uint> weights;
1200
+ vectors.reserve (selectors.size ());
1201
+ weights.reserve (selectors.size ());
1202
+ for (uint64 prev_selector = 0 ; queue.size ();) {
1203
+ SelectorNode node = queue.top ();
1204
+ uint64 selector = *node.p ++;
1205
+ queue.pop ();
1206
+ if (node.p != node.pEnd )
1207
+ queue.push (node);
1208
+ uint weight = (uint16)selector;
1209
+ selector >>= 16 ;
1210
+ if (!vectors.size () || selector != prev_selector) {
1211
+ prev_selector = selector;
1212
+ vec16F vector;
1213
+ for (uint p = 0 ; p < 16 ; p++, selector >>= 3 )
1214
+ vector[15 - p] = v[selector & 7 ];
1215
+ vectors.push_back (vector);
1216
+ weights.push_back (weight);
1217
+ } else if (weights.back () > UINT_MAX - weight) {
1218
+ weights.back () = UINT_MAX;
1219
+ } else {
1220
+ weights.back () += weight;
1035
1221
}
1036
1222
}
1037
- selector_vq.generate_codebook (m_params.m_alpha_selector_codebook_size , false , m_pTask_pool);
1223
+
1224
+ tree_clusterizer<vec16F> selector_vq;
1225
+ selector_vq.generate_codebook (vectors.get_ptr (), weights.get_ptr (), vectors.size (), m_params.m_alpha_selector_codebook_size , false , m_pTask_pool);
1038
1226
m_alpha_selectors.resize (selector_vq.get_codebook_size ());
1039
1227
m_alpha_selectors_used.resize (selector_vq.get_codebook_size ());
1040
1228
for (uint i = 0 ; i < selector_vq.get_codebook_size (); i++) {
@@ -1044,7 +1232,6 @@ void dxt_hc::create_alpha_selector_codebook() {
1044
1232
m_alpha_selectors[i] |= (uint64)(v[j] * 8 .0f ) << sh;
1045
1233
}
1046
1234
1047
- uint num_tasks = m_pTask_pool->get_num_threads () + 1 ;
1048
1235
crnlib::vector<crnlib::vector<alpha_selector_details> > selector_details (num_tasks);
1049
1236
for (uint t = 0 ; t < num_tasks; t++) {
1050
1237
selector_details[t].resize (m_alpha_selectors.size ());
0 commit comments