fix: tensor loading thread count (leejet#854)

wbruna · web-flow · commit f3140eadbb65 · 2025-09-25T00:26:38.000+08:00
diff --git a/conditioner.hpp b/conditioner.hpp
@@ -141,7 +141,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             }
             return true;
         };
-        model_loader.load_tensors(on_load);
+        model_loader.load_tensors(on_load, 1);
         readed_embeddings.push_back(embd_name);
         if (embd) {
             int64_t hidden_size = text_model->model.hidden_size;
diff --git a/control.hpp b/control.hpp
@@ -445,7 +445,7 @@ struct ControlNet : public GGMLRunner {
         guided_hint_cached = true;
     }
 
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading control net from '%s'", file_path.c_str());
         alloc_params_buffer();
         std::map<std::string, ggml_tensor*> tensors;
@@ -458,7 +458,7 @@ struct ControlNet : public GGMLRunner {
             return false;
         }
 
-        bool success = model_loader.load_tensors(tensors, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
 
         if (!success) {
             LOG_ERROR("load control net tensors from model loader failed");
diff --git a/esrgan.hpp b/esrgan.hpp
@@ -164,7 +164,7 @@ struct ESRGAN : public GGMLRunner {
         return "esrgan";
     }
 
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading esrgan from '%s'", file_path.c_str());
 
         alloc_params_buffer();
@@ -177,7 +177,7 @@ struct ESRGAN : public GGMLRunner {
             return false;
         }
 
-        bool success = model_loader.load_tensors(esrgan_tensors);
+        bool success = model_loader.load_tensors(esrgan_tensors, {}, n_threads);
 
         if (!success) {
             LOG_ERROR("load esrgan tensors from model loader failed");
diff --git a/lora.hpp b/lora.hpp
@@ -116,7 +116,7 @@ struct LoraModel : public GGMLRunner {
         return "lora";
     }
 
-    bool load_from_file(bool filter_tensor = false, int n_threads = 0) {
+    bool load_from_file(bool filter_tensor, int n_threads) {
         LOG_INFO("loading LoRA from '%s'", file_path.c_str());
 
         if (load_failed) {
diff --git a/model.cpp b/model.cpp
@@ -1957,7 +1957,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
     std::atomic<int64_t> copy_to_backend_time_ms(0);
     std::atomic<int64_t> convert_time_ms(0);
 
-    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency();
+    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : get_num_physical_cores();
+    LOG_DEBUG("using %d threads for model loading", num_threads_to_use);
 
     int64_t start_time = ggml_time_ms();
     std::vector<TensorStorage> processed_tensor_storages;
diff --git a/pmid.hpp b/pmid.hpp
@@ -591,7 +591,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
         return "id_embeds";
     }
 
-    bool load_from_file(bool filter_tensor = false) {
+    bool load_from_file(bool filter_tensor, int n_threads) {
         LOG_INFO("loading PhotoMaker ID Embeds from '%s'", file_path.c_str());
 
         if (load_failed) {
@@ -623,11 +623,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
             return true;
         };
 
-        model_loader->load_tensors(on_new_tensor_cb);
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
         alloc_params_buffer();
 
         dry_run = false;
-        model_loader->load_tensors(on_new_tensor_cb);
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
 
         LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
         return true;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -531,7 +531,7 @@ class StableDiffusionGGML {
             }
             if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {
                 pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->photo_maker_path, "");
-                if (!pmid_lora->load_from_file(true)) {
+                if (!pmid_lora->load_from_file(true, n_threads)) {
                     LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path);
                     return false;
                 }
@@ -599,14 +599,14 @@ class StableDiffusionGGML {
             if (!use_tiny_autoencoder) {
                 vae_params_mem_size = first_stage_model->get_params_buffer_size();
             } else {
-                if (!tae_first_stage->load_from_file(taesd_path)) {
+                if (!tae_first_stage->load_from_file(taesd_path, n_threads)) {
                     return false;
                 }
                 vae_params_mem_size = tae_first_stage->get_params_buffer_size();
             }
             size_t control_net_params_mem_size = 0;
             if (control_net) {
-                if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path))) {
+                if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_threads)) {
                     return false;
                 }
                 control_net_params_mem_size = control_net->get_params_buffer_size();
@@ -836,7 +836,7 @@ class StableDiffusionGGML {
             return;
         }
         LoraModel lora(backend, file_path, is_high_noise ? "model.high_noise_" : "");
-        if (!lora.load_from_file()) {
+        if (!lora.load_from_file(false, n_threads)) {
             LOG_WARN("load lora tensors from %s failed", file_path.c_str());
             return;
         }
diff --git a/tae.hpp b/tae.hpp
@@ -222,7 +222,7 @@ struct TinyAutoEncoder : public GGMLRunner {
         return "taesd";
     }
 
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false");
         alloc_params_buffer();
         std::map<std::string, ggml_tensor*> taesd_tensors;
@@ -238,7 +238,7 @@ struct TinyAutoEncoder : public GGMLRunner {
             return false;
         }
 
-        bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors);
+        bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors, n_threads);
 
         if (!success) {
             LOG_ERROR("load tae tensors from model loader failed");
diff --git a/upscaler.cpp b/upscaler.cpp
@@ -18,7 +18,8 @@ struct UpscalerGGML {
     }
 
     bool load_from_file(const std::string& esrgan_path,
-                        bool offload_params_to_cpu) {
+                        bool offload_params_to_cpu,
+                        int n_threads) {
         ggml_log_set(ggml_log_callback_default, nullptr);
 #ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
@@ -54,7 +55,7 @@ struct UpscalerGGML {
         if (direct) {
             esrgan_upscaler->enable_conv2d_direct();
         }
-        if (!esrgan_upscaler->load_from_file(esrgan_path)) {
+        if (!esrgan_upscaler->load_from_file(esrgan_path, n_threads)) {
             return false;
         }
         return true;
@@ -124,7 +125,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
         return NULL;
     }
 
-    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu)) {
+    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) {
         delete upscaler_ctx->upscaler;
         upscaler_ctx->upscaler = NULL;
         free(upscaler_ctx);

Original file line number	Diff line number	Diff line change
`@@ -141,7 +141,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {`
`141`	`141`	`}`
`142`	`142`	`return true;`
`143`	`143`	`};`
`144`		`- model_loader.load_tensors(on_load);`
	`144`	`+ model_loader.load_tensors(on_load, 1);`
`145`	`145`	`readed_embeddings.push_back(embd_name);`
`146`	`146`	`if (embd) {`
`147`	`147`	`int64_t hidden_size = text_model->model.hidden_size;`
Original file line number	Diff line number	Diff line change
`@@ -445,7 +445,7 @@ struct ControlNet : public GGMLRunner {`
`445`	`445`	`guided_hint_cached = true;`
`446`	`446`	`}`
`447`	`447`
`448`		`- bool load_from_file(const std::string& file_path) {`
	`448`	`+ bool load_from_file(const std::string& file_path, int n_threads) {`
`449`	`449`	`LOG_INFO("loading control net from '%s'", file_path.c_str());`
`450`	`450`	`alloc_params_buffer();`
`451`	`451`	`std::map<std::string, ggml_tensor*> tensors;`
`@@ -458,7 +458,7 @@ struct ControlNet : public GGMLRunner {`
`458`	`458`	`return false;`
`459`	`459`	`}`
`460`	`460`
`461`		`- bool success = model_loader.load_tensors(tensors, ignore_tensors);`
	`461`	`+ bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);`
`462`	`462`
`463`	`463`	`if (!success) {`
`464`	`464`	`LOG_ERROR("load control net tensors from model loader failed");`
Original file line number	Diff line number	Diff line change
`@@ -164,7 +164,7 @@ struct ESRGAN : public GGMLRunner {`
`164`	`164`	`return "esrgan";`
`165`	`165`	`}`
`166`	`166`
`167`		`- bool load_from_file(const std::string& file_path) {`
	`167`	`+ bool load_from_file(const std::string& file_path, int n_threads) {`
`168`	`168`	`LOG_INFO("loading esrgan from '%s'", file_path.c_str());`
`169`	`169`
`170`	`170`	`alloc_params_buffer();`
`@@ -177,7 +177,7 @@ struct ESRGAN : public GGMLRunner {`
`177`	`177`	`return false;`
`178`	`178`	`}`
`179`	`179`
`180`		`- bool success = model_loader.load_tensors(esrgan_tensors);`
	`180`	`+ bool success = model_loader.load_tensors(esrgan_tensors, {}, n_threads);`
`181`	`181`
`182`	`182`	`if (!success) {`
`183`	`183`	`LOG_ERROR("load esrgan tensors from model loader failed");`
Original file line number	Diff line number	Diff line change
`@@ -116,7 +116,7 @@ struct LoraModel : public GGMLRunner {`
`116`	`116`	`return "lora";`
`117`	`117`	`}`
`118`	`118`
`119`		`- bool load_from_file(bool filter_tensor = false, int n_threads = 0) {`
	`119`	`+ bool load_from_file(bool filter_tensor, int n_threads) {`
`120`	`120`	`LOG_INFO("loading LoRA from '%s'", file_path.c_str());`
`121`	`121`
`122`	`122`	`if (load_failed) {`
Original file line number	Diff line number	Diff line change
`@@ -531,7 +531,7 @@ class StableDiffusionGGML {`
`531`	`531`	`}`
`532`	`532`	`if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {`
`533`	`533`	`pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->photo_maker_path, "");`
`534`		`- if (!pmid_lora->load_from_file(true)) {`
	`534`	`+ if (!pmid_lora->load_from_file(true, n_threads)) {`
`535`	`535`	`LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path);`
`536`	`536`	`return false;`
`537`	`537`	`}`
`@@ -599,14 +599,14 @@ class StableDiffusionGGML {`
`599`	`599`	`if (!use_tiny_autoencoder) {`
`600`	`600`	`vae_params_mem_size = first_stage_model->get_params_buffer_size();`
`601`	`601`	`} else {`
`602`		`- if (!tae_first_stage->load_from_file(taesd_path)) {`
	`602`	`+ if (!tae_first_stage->load_from_file(taesd_path, n_threads)) {`
`603`	`603`	`return false;`
`604`	`604`	`}`
`605`	`605`	`vae_params_mem_size = tae_first_stage->get_params_buffer_size();`
`606`	`606`	`}`
`607`	`607`	`size_t control_net_params_mem_size = 0;`
`608`	`608`	`if (control_net) {`
`609`		`- if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path))) {`
	`609`	`+ if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_threads)) {`
`610`	`610`	`return false;`
`611`	`611`	`}`
`612`	`612`	`control_net_params_mem_size = control_net->get_params_buffer_size();`
`@@ -836,7 +836,7 @@ class StableDiffusionGGML {`
`836`	`836`	`return;`
`837`	`837`	`}`
`838`	`838`	`LoraModel lora(backend, file_path, is_high_noise ? "model.high_noise_" : "");`
`839`		`- if (!lora.load_from_file()) {`
	`839`	`+ if (!lora.load_from_file(false, n_threads)) {`
`840`	`840`	`LOG_WARN("load lora tensors from %s failed", file_path.c_str());`
`841`	`841`	`return;`
`842`	`842`	`}`
Original file line number	Diff line number	Diff line change
`@@ -222,7 +222,7 @@ struct TinyAutoEncoder : public GGMLRunner {`
`222`	`222`	`return "taesd";`
`223`	`223`	`}`
`224`	`224`
`225`		`- bool load_from_file(const std::string& file_path) {`
	`225`	`+ bool load_from_file(const std::string& file_path, int n_threads) {`
`226`	`226`	`LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false");`
`227`	`227`	`alloc_params_buffer();`
`228`	`228`	`std::map<std::string, ggml_tensor*> taesd_tensors;`
`@@ -238,7 +238,7 @@ struct TinyAutoEncoder : public GGMLRunner {`
`238`	`238`	`return false;`
`239`	`239`	`}`
`240`	`240`
`241`		`- bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors);`
	`241`	`+ bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors, n_threads);`
`242`	`242`
`243`	`243`	`if (!success) {`
`244`	`244`	`LOG_ERROR("load tae tensors from model loader failed");`
Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,8 @@ struct UpscalerGGML {`
`18`	`18`	`}`
`19`	`19`
`20`	`20`	`bool load_from_file(const std::string& esrgan_path,`
`21`		`- bool offload_params_to_cpu) {`
	`21`	`+ bool offload_params_to_cpu,`
	`22`	`+ int n_threads) {`
`22`	`23`	`ggml_log_set(ggml_log_callback_default, nullptr);`
`23`	`24`	`#ifdef SD_USE_CUDA`
`24`	`25`	`LOG_DEBUG("Using CUDA backend");`
`@@ -54,7 +55,7 @@ struct UpscalerGGML {`
`54`	`55`	`if (direct) {`
`55`	`56`	`esrgan_upscaler->enable_conv2d_direct();`
`56`	`57`	`}`
`57`		`- if (!esrgan_upscaler->load_from_file(esrgan_path)) {`
	`58`	`+ if (!esrgan_upscaler->load_from_file(esrgan_path, n_threads)) {`
`58`	`59`	`return false;`
`59`	`60`	`}`
`60`	`61`	`return true;`
`@@ -124,7 +125,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,`
`124`	`125`	`return NULL;`
`125`	`126`	`}`
`126`	`127`
`127`		`- if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu)) {`
	`128`	`+ if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) {`
`128`	`129`	`delete upscaler_ctx->upscaler;`
`129`	`130`	`upscaler_ctx->upscaler = NULL;`
`130`	`131`	`free(upscaler_ctx);`