@@ -598,8 +598,40 @@ __STATIC_INLINE__ void ggml_tensor_scale_output(struct ggml_tensor* src) {
598598
599599typedef std::function<void (ggml_tensor*, ggml_tensor*, bool )> on_tile_process;
600600
601+ __STATIC_INLINE__ void
602+ sd_tiling_calc_tiles (int &num_tiles_dim, float & tile_overlap_factor_dim, int small_dim, int tile_size, const float tile_overlap_factor) {
603+
604+ int tile_overlap = (tile_size * tile_overlap_factor);
605+ int non_tile_overlap = tile_size - tile_overlap;
606+
607+ num_tiles_dim = (small_dim - tile_overlap) / non_tile_overlap;
608+ int overshoot_dim = ((num_tiles_dim + 1 ) * non_tile_overlap + tile_overlap) % small_dim;
609+
610+ if ((overshoot_dim != non_tile_overlap) && (overshoot_dim <= num_tiles_dim * (tile_size / 2 - tile_overlap))) {
611+ // if tiles don't fit perfectly using the desired overlap
612+ // and there is enough room to squeeze an extra tile without overlap becoming >0.5
613+ num_tiles_dim++;
614+ }
615+
616+ tile_overlap_factor_dim = (float )(tile_size * num_tiles_dim - small_dim) / (float )(tile_size * (num_tiles_dim - 1 ));
617+ if (num_tiles_dim <= 2 ) {
618+ if (small_dim <= tile_size) {
619+ num_tiles_dim = 1 ;
620+ tile_overlap_factor_dim = 0 ;
621+ } else {
622+ num_tiles_dim = 2 ;
623+ tile_overlap_factor_dim = (2 * tile_size - small_dim) / (float )tile_size;
624+ }
625+ }
626+ }
627+
601628// Tiling
602- __STATIC_INLINE__ void sd_tiling (ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
629+ __STATIC_INLINE__ void sd_tiling_non_square (ggml_tensor* input, ggml_tensor* output, const int scale,
630+ const int p_tile_size_x, const int p_tile_size_y,
631+ const float tile_overlap_factor, on_tile_process on_processing) {
632+
633+ output = ggml_set_f32 (output, 0 );
634+
603635 int input_width = (int )input->ne [0 ];
604636 int input_height = (int )input->ne [1 ];
605637 int output_width = (int )output->ne [0 ];
@@ -618,62 +650,27 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
618650 small_height = input_height;
619651 }
620652
621- int tile_overlap = (tile_size * tile_overlap_factor);
622- int non_tile_overlap = tile_size - tile_overlap;
653+ int num_tiles_x;
654+ float tile_overlap_factor_x;
655+ sd_tiling_calc_tiles (num_tiles_x, tile_overlap_factor_x, small_width, p_tile_size_x, tile_overlap_factor);
623656
624- int num_tiles_x = (small_width - tile_overlap) / non_tile_overlap;
625- int overshoot_x = ((num_tiles_x + 1 ) * non_tile_overlap + tile_overlap) % small_width;
626-
627- if ((overshoot_x != non_tile_overlap) && (overshoot_x <= num_tiles_x * (tile_size / 2 - tile_overlap))) {
628- // if tiles don't fit perfectly using the desired overlap
629- // and there is enough room to squeeze an extra tile without overlap becoming >0.5
630- num_tiles_x++;
631- }
632-
633- float tile_overlap_factor_x = (float )(tile_size * num_tiles_x - small_width) / (float )(tile_size * (num_tiles_x - 1 ));
634- if (num_tiles_x <= 2 ) {
635- if (small_width <= tile_size) {
636- num_tiles_x = 1 ;
637- tile_overlap_factor_x = 0 ;
638- } else {
639- num_tiles_x = 2 ;
640- tile_overlap_factor_x = (2 * tile_size - small_width) / (float )tile_size;
641- }
642- }
643-
644- int num_tiles_y = (small_height - tile_overlap) / non_tile_overlap;
645- int overshoot_y = ((num_tiles_y + 1 ) * non_tile_overlap + tile_overlap) % small_height;
646-
647- if ((overshoot_y != non_tile_overlap) && (overshoot_y <= num_tiles_y * (tile_size / 2 - tile_overlap))) {
648- // if tiles don't fit perfectly using the desired overlap
649- // and there is enough room to squeeze an extra tile without overlap becoming >0.5
650- num_tiles_y++;
651- }
652-
653- float tile_overlap_factor_y = (float )(tile_size * num_tiles_y - small_height) / (float )(tile_size * (num_tiles_y - 1 ));
654- if (num_tiles_y <= 2 ) {
655- if (small_height <= tile_size) {
656- num_tiles_y = 1 ;
657- tile_overlap_factor_y = 0 ;
658- } else {
659- num_tiles_y = 2 ;
660- tile_overlap_factor_y = (2 * tile_size - small_height) / (float )tile_size;
661- }
662- }
657+ int num_tiles_y;
658+ float tile_overlap_factor_y;
659+ sd_tiling_calc_tiles (num_tiles_y, tile_overlap_factor_y, small_height, p_tile_size_y, tile_overlap_factor);
663660
664661 LOG_DEBUG (" num tiles : %d, %d " , num_tiles_x, num_tiles_y);
665662 LOG_DEBUG (" optimal overlap : %f, %f (targeting %f)" , tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor);
666663
667664 GGML_ASSERT (input_width % 2 == 0 && input_height % 2 == 0 && output_width % 2 == 0 && output_height % 2 == 0 ); // should be multiple of 2
668665
669- int tile_overlap_x = (int32_t )(tile_size * tile_overlap_factor_x);
670- int non_tile_overlap_x = tile_size - tile_overlap_x;
666+ int tile_overlap_x = (int32_t )(p_tile_size_x * tile_overlap_factor_x);
667+ int non_tile_overlap_x = p_tile_size_x - tile_overlap_x;
671668
672- int tile_overlap_y = (int32_t )(tile_size * tile_overlap_factor_y);
673- int non_tile_overlap_y = tile_size - tile_overlap_y;
669+ int tile_overlap_y = (int32_t )(p_tile_size_y * tile_overlap_factor_y);
670+ int non_tile_overlap_y = p_tile_size_y - tile_overlap_y;
674671
675- int tile_size_x = tile_size < small_width ? tile_size : small_width;
676- int tile_size_y = tile_size < small_height ? tile_size : small_height;
672+ int tile_size_x = p_tile_size_x < small_width ? p_tile_size_x : small_width;
673+ int tile_size_y = p_tile_size_y < small_height ? p_tile_size_y : small_height;
677674
678675 int input_tile_size_x = tile_size_x;
679676 int input_tile_size_y = tile_size_y;
@@ -762,6 +759,11 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
762759 ggml_free (tiles_ctx);
763760}
764761
762+ __STATIC_INLINE__ void sd_tiling (ggml_tensor* input, ggml_tensor* output, const int scale,
763+ const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
764+ sd_tiling_non_square (input, output, scale, tile_size, tile_size, tile_overlap_factor, on_processing);
765+ }
766+
765767__STATIC_INLINE__ struct ggml_tensor * ggml_group_norm_32 (struct ggml_context * ctx,
766768 struct ggml_tensor * a) {
767769 const float eps = 1e-6f ; // default eps parameter
0 commit comments