stduhpf
diff --git a/diffusion_model.hpp b/diffusion_model.hpp
Lines changed: 29 additions & 29 deletions
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
Lines changed: 75 additions & 46 deletions
diff --git a/examples/server/main.cpp b/examples/server/main.cpp
Lines changed: 1 addition & 2 deletions
@@ -13,13 +13,13 @@ struct DiffusionModel {
                          struct ggml_tensor* c_concat,
                          struct ggml_tensor* y,
                          struct ggml_tensor* guidance,
-                         int num_video_frames                          = -1,
-                         std::vector<struct ggml_tensor*> controls     = {},
-                         float control_strength                        = 0.f,
-                         std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
-                         struct ggml_tensor** output                   = NULL,
-                         struct ggml_context* output_ctx               = NULL,
-                         std::vector<int> skip_layers                  = std::vector<int>())             = 0;
+                         std::vector<ggml_tensor*> ref_latents = {},
+                         int num_video_frames                      = -1,
+                         std::vector<struct ggml_tensor*> controls = {},
+                         float control_strength                    = 0.f,
+                         struct ggml_tensor** output               = NULL,
+                         struct ggml_context* output_ctx           = NULL,
+                         std::vector<int> skip_layers              = std::vector<int>())             = 0;
     virtual void alloc_params_buffer()                                                  = 0;
     virtual void free_params_buffer()                                                   = 0;
     virtual void free_compute_buffer()                                                  = 0;
@@ -69,13 +69,13 @@ struct UNetModel : public DiffusionModel {
                  struct ggml_tensor* c_concat,
                  struct ggml_tensor* y,
                  struct ggml_tensor* guidance,
-                 int num_video_frames                          = -1,
-                 std::vector<struct ggml_tensor*> controls     = {},
-                 float control_strength                        = 0.f,
-                 std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
-                 struct ggml_tensor** output                   = NULL,
-                 struct ggml_context* output_ctx               = NULL,
-                 std::vector<int> skip_layers                  = std::vector<int>()) {
+                 std::vector<ggml_tensor*> ref_latents = {},
+                 int num_video_frames                      = -1,
+                 std::vector<struct ggml_tensor*> controls = {},
+                 float control_strength                    = 0.f,
+                 struct ggml_tensor** output               = NULL,
+                 struct ggml_context* output_ctx           = NULL,
+                 std::vector<int> skip_layers              = std::vector<int>()) {
         (void)skip_layers;  // SLG doesn't work with UNet models
         return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
     }
@@ -120,13 +120,13 @@ struct MMDiTModel : public DiffusionModel {
                  struct ggml_tensor* c_concat,
                  struct ggml_tensor* y,
                  struct ggml_tensor* guidance,
-                 int num_video_frames                      = -1,
-                 std::vector<struct ggml_tensor*> controls = {},
-                 float control_strength                    = 0.f,
-                 std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
-                 struct ggml_tensor** output               = NULL,
-                 struct ggml_context* output_ctx           = NULL,
-                 std::vector<int> skip_layers              = std::vector<int>()) {
+                 std::vector<struct ggml_tensor*> ref_latents = {},
+                 int num_video_frames                          = -1,
+                 std::vector<struct ggml_tensor*> controls     = {},
+                 float control_strength                        = 0.f,
+                 struct ggml_tensor** output                   = NULL,
+                 struct ggml_context* output_ctx               = NULL,
+                 std::vector<int> skip_layers                  = std::vector<int>()) {
         return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx, skip_layers);
     }
 };
@@ -172,14 +172,14 @@ struct FluxModel : public DiffusionModel {
                  struct ggml_tensor* c_concat,
                  struct ggml_tensor* y,
                  struct ggml_tensor* guidance,
-                 int num_video_frames                          = -1,
-                 std::vector<struct ggml_tensor*> controls     = {},
-                 float control_strength                        = 0.f,
-                 std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
-                 struct ggml_tensor** output                   = NULL,
-                 struct ggml_context* output_ctx               = NULL,
-                 std::vector<int> skip_layers                  = std::vector<int>()) {
-        return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, kontext_imgs, output, output_ctx, skip_layers);
+                 std::vector<ggml_tensor*> ref_latents = {},
+                 int num_video_frames                      = -1,
+                 std::vector<struct ggml_tensor*> controls = {},
+                 float control_strength                    = 0.f,
+                 struct ggml_tensor** output               = NULL,
+                 struct ggml_context* output_ctx           = NULL,
+                 std::vector<int> skip_layers              = std::vector<int>()) {
+        return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, output, output_ctx, skip_layers);
     }
 };
 
 
@@ -57,6 +57,7 @@ const char* modes_str[] = {
     "txt2img",
     "img2img",
     "img2vid",
+    "edit",
     "convert",
 };
 
@@ -71,6 +72,7 @@ enum SDMode {
     TXT2IMG,
     IMG2IMG,
     IMG2VID,
+    EDIT,
     CONVERT,
     MODE_COUNT
 };
@@ -96,8 +98,7 @@ struct SDParams {
     std::string input_path;
     std::string mask_path;
     std::string control_image_path;
-
-    std::vector<std::string> kontext_image_paths;
+    std::vector<std::string> ref_image_paths;
 
     std::string prompt;
     std::string negative_prompt;
@@ -181,6 +182,10 @@ void print_params(SDParams params) {
     printf("    init_img:          %s\n", params.input_path.c_str());
     printf("    mask_img:          %s\n", params.mask_path.c_str());
     printf("    control_image:     %s\n", params.control_image_path.c_str());
+    printf("    ref_images_paths:\n");
+    for (auto& path : params.ref_image_paths) {
+        printf("        %s\n", path.c_str());
+    };
     printf("    clip on cpu:       %s\n", params.clip_on_cpu ? "true" : "false");
     printf("    controlnet cpu:    %s\n", params.control_net_cpu ? "true" : "false");
     printf("    vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
@@ -241,6 +246,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("  -i, --init-img [IMAGE]             path to the input image, required by img2img\n");
     printf("  --mask [MASK]                      path to the mask image, required by img2img with mask\n");
     printf("  --control-image [IMAGE]            path to image condition, control net\n");
+    printf("  -r, --ref-image [PATH]             reference image for Flux Kontext models (can be used multiple times) \n");
     printf("  -o, --output OUTPUT                path to write result image to (default: ./output.png)\n");
     printf("  -p, --prompt [PROMPT]              the prompt to render\n");
     printf("  -n, --negative-prompt PROMPT       the negative prompt (default: \"\")\n");
@@ -289,9 +295,8 @@ void print_usage(int argc, const char* argv[]) {
     printf("                                     %s is the fastest\n", previews_str[SD_PREVIEW_PROJ]);
     printf("  --preview-interval [N]             How often to save the image preview");
     printf("  --preview-path [PATH}              path to write preview image to (default: ./preview.png)\n");
-    printf("  --color                            Colors the logging tags according to level\n");
+    printf("  --color                            colors the logging tags according to level\n");
     printf("  -v, --verbose                      print extra info\n");
-    printf("  -ki, --kontext_img [PATH]        Reference image for Flux Kontext models (can be used multiple times) \n");
 }
 
 void parse_args(int argc, const char** argv, SDParams& params) {
@@ -727,12 +732,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.imatrix_in.push_back(std::string(argv[i]));
-        } else if (arg == "-ki" || arg == "--kontext-img") {
+        } else if (arg == "-r" || arg == "--ref-image") {
             if (++i >= argc) {
                 invalid_arg = true;
                 break;
             }
-            params.kontext_image_paths.push_back(argv[i]);
+            params.ref_image_paths.push_back(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             print_usage(argc, argv);
@@ -797,7 +802,13 @@ void parse_args(int argc, const char** argv, SDParams& params) {
     }
 
     if ((params.mode == IMG2IMG || params.mode == IMG2VID) && params.input_path.length() == 0) {
-        fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
+        fprintf(stderr, "error: when using the img2img/img2vid mode, the following arguments are required: init-img\n");
+        print_usage(argc, argv);
+        exit(1);
+    }
+
+    if (params.mode == EDIT && params.ref_image_paths.size() == 0) {
+        fprintf(stderr, "error: when using the edit mode, the following arguments are required: ref-image\n");
         print_usage(argc, argv);
         exit(1);
     }
@@ -1014,43 +1025,12 @@ int main(int argc, const char* argv[]) {
         fprintf(stderr, "SVD support is broken, do not use it!!!\n");
         return 1;
     }
-    bool vae_decode_only          = true;
-
-    std::vector<sd_image_t> kontext_imgs;
-    for (auto& path : params.kontext_image_paths) {
-        vae_decode_only = false;
-        int c                 = 0;
-        int width             = 0;
-        int height            = 0;
-        uint8_t* image_buffer = stbi_load(path.c_str(), &width, &height, &c, 3);
-        if (image_buffer == NULL) {
-            fprintf(stderr, "load image from '%s' failed\n", path.c_str());
-            return 1;
-        }
-        if (c < 3) {
-            fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c);
-            free(image_buffer);
-            return 1;
-        }
-        if (width <= 0) {
-            fprintf(stderr, "error: the width of image must be greater than 0\n");
-            free(image_buffer);
-            return 1;
-        }
-        if (height <= 0) {
-            fprintf(stderr, "error: the height of image must be greater than 0\n");
-            free(image_buffer);
-            return 1;
-        }
-        kontext_imgs.push_back({(uint32_t)width,
-                                (uint32_t)height,
-                                3,
-                                image_buffer});
-    }
 
+    bool vae_decode_only          = true;
     uint8_t* input_image_buffer   = NULL;
     uint8_t* control_image_buffer = NULL;
     uint8_t* mask_image_buffer    = NULL;
+    std::vector<sd_image_t> ref_images;
 
     if (params.mode == IMG2IMG || params.mode == IMG2VID) {
         vae_decode_only = false;
@@ -1102,6 +1082,37 @@ int main(int argc, const char* argv[]) {
             free(input_image_buffer);
             input_image_buffer = resized_image_buffer;
         }
+    } else if (params.mode == EDIT) {
+        vae_decode_only = false;
+        for (auto& path : params.ref_image_paths) {
+            int c                 = 0;
+            int width             = 0;
+            int height            = 0;
+            uint8_t* image_buffer = stbi_load(path.c_str(), &width, &height, &c, 3);
+            if (image_buffer == NULL) {
+                fprintf(stderr, "load image from '%s' failed\n", path.c_str());
+                return 1;
+            }
+            if (c < 3) {
+                fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c);
+                free(image_buffer);
+                return 1;
+            }
+            if (width <= 0) {
+                fprintf(stderr, "error: the width of image must be greater than 0\n");
+                free(image_buffer);
+                return 1;
+            }
+            if (height <= 0) {
+                fprintf(stderr, "error: the height of image must be greater than 0\n");
+                free(image_buffer);
+                return 1;
+            }
+            ref_images.push_back({(uint32_t)width,
+                                  (uint32_t)height,
+                                  3,
+                                  image_buffer});
+        }
     }
 
     sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(),
@@ -1187,9 +1198,8 @@ int main(int argc, const char* argv[]) {
                           params.control_strength,
                           params.style_ratio,
                           params.normalize_input,
-                          params.input_id_images_path.c_str(),
-                          kontext_imgs.data(), kontext_imgs.size());
-    } else {
+                          params.input_id_images_path.c_str());
+    } else if (params.mode == IMG2IMG || params.mode == IMG2VID) {
         sd_image_t input_image = {(uint32_t)params.width,
                                   (uint32_t)params.height,
                                   3,
@@ -1250,9 +1260,28 @@ int main(int argc, const char* argv[]) {
                               params.control_strength,
                               params.style_ratio,
                               params.normalize_input,
-                              params.input_id_images_path.c_str(),
-                              kontext_imgs.data(), kontext_imgs.size());
+                              params.input_id_images_path.c_str());
         }
+    } else {  // EDIT
+        results = edit(sd_ctx,
+                       ref_images.data(),
+                       ref_images.size(),
+                       params.prompt.c_str(),
+                       params.negative_prompt.c_str(),
+                       params.clip_skip,
+                       guidance_params,
+                       params.eta,
+                       params.width,
+                       params.height,
+                       params.sample_method,
+                       params.sample_steps,
+                       params.seed,
+                       params.batch_count,
+                       control_image,
+                       params.control_strength,
+                       params.style_ratio,
+                       params.normalize_input,
+                       params.input_id_images_path.c_str());
     }
 
     if (results == NULL) {
@@ -1335,4 +1364,4 @@ int main(int argc, const char* argv[]) {
     free(input_image_buffer);
 
     return 0;
-}
+}
@@ -1516,8 +1516,7 @@ void start_server(SDParams params) {
                                   1,
                                   params.lastRequest.style_ratio,
                                   params.lastRequest.normalize_input,
-                                  params.input_id_images_path.c_str(),
-                                  NULL, 0);
+                                  params.input_id_images_path.c_str());
 
                 if (results == NULL) {
                     printf("generate failed\n");