@@ -466,9 +466,7 @@ struct llm_graph_params {
 
 class llm_graph_result : public llm_graph_result_i {
 public:
-    llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) {
-        reset();
-    }
+    llm_graph_result(int64_t max_nodes);
 
     virtual ~llm_graph_result() = default;
 
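With the body gone from the header, the constructor presumably moves to the implementation file. A minimal sketch, assuming llama-graph.cpp as the destination and assuming the new `debug` member (added at the end of this diff, per its "// env: LLAMA_GRAPH_RESULT_DEBUG" comment) is initialized from the environment here; the reset() call is copied from the removed inline body:

#include <cstdlib> // getenv, atoi

// llama-graph.cpp (assumed destination for the out-of-line constructor)
llm_graph_result::llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) {
    reset();

    // assumption: initialize the new `debug` member from the environment
    const char * env = getenv("LLAMA_GRAPH_RESULT_DEBUG");
    debug = env ? atoi(env) : 0;
}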
@@ -480,60 +478,20 @@ class llm_graph_result : public llm_graph_result_i {
     ggml_cgraph  * get_gf()  override { return gf; }
     ggml_context * get_ctx() override { return ctx_compute.get(); }
 
-    int64_t get_max_nodes() const {
-        return max_nodes;
-    }
-
-    void reset() override {
-        t_tokens      = nullptr;
-        t_logits      = nullptr;
-        t_embd        = nullptr;
-        t_embd_pooled = nullptr;
-
-        inputs.clear();
-
-        buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
-
-        ggml_init_params params = {
-            /*.mem_size   =*/ buf_compute_meta.size(),
-            /*.mem_buffer =*/ buf_compute_meta.data(),
-            /*.no_alloc   =*/ true,
-        };
-
-        ctx_compute.reset(ggml_init(params));
-
-        gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false);
-    }
-
-    void set_inputs(const llama_ubatch * ubatch) override {
-        for (auto & input : inputs) {
-            input->set_input(ubatch);
-        }
-    }
+    int64_t get_max_nodes() const;
+
+    void reset() override;
+
+    void set_inputs(const llama_ubatch * ubatch) override;
 
     // try to update the existing graph result using the new graph parameters in order to reuse it
     // this can only be done if we determine that the resulting graph using the new graph parameters
     // would be identical to the existing graph. in that case, we simply have to update the memory
     // contexts of the input tensors of the graph and we can reuse it for another computation
     // return true if the graph was updated and can be reused
-    bool can_reuse(const llm_graph_params & params) override {
-        if (!this->params.allow_reuse(params)) {
-            return false;
-        }
-
-        bool res = true;
-
-        for (auto & input : inputs) {
-            res &= input->can_reuse(params);
-        }
-
-        return res;
-    }
-
-    llm_graph_input_i * add_input(llm_graph_input_ptr input) {
-        inputs.emplace_back(std::move(input));
-        return inputs.back().get();
-    }
+    bool can_reuse(const llm_graph_params & params) override;
+
+    llm_graph_input_i * add_input(llm_graph_input_ptr input);
 
     // important graph nodes
     ggml_tensor * t_tokens = nullptr;
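For reference, the remaining declarations pair with out-of-line definitions; a sketch assuming the removed inline bodies move to llama-graph.cpp unchanged (the bodies below are copied from the deleted lines above; `override` is dropped, since it may only appear inside the class definition):

int64_t llm_graph_result::get_max_nodes() const {
    return max_nodes;
}

void llm_graph_result::reset() {
    t_tokens      = nullptr;
    t_logits      = nullptr;
    t_embd        = nullptr;
    t_embd_pooled = nullptr;

    inputs.clear();

    buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));

    ggml_init_params params = {
        /*.mem_size   =*/ buf_compute_meta.size(),
        /*.mem_buffer =*/ buf_compute_meta.data(),
        /*.no_alloc   =*/ true,
    };

    ctx_compute.reset(ggml_init(params));

    gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false);
}

void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
    for (auto & input : inputs) {
        input->set_input(ubatch);
    }
}

bool llm_graph_result::can_reuse(const llm_graph_params & params) {
    if (!this->params.allow_reuse(params)) {
        return false;
    }

    bool res = true;

    for (auto & input : inputs) {
        res &= input->can_reuse(params);
    }

    return res;
}

llm_graph_input_i * llm_graph_result::add_input(llm_graph_input_ptr input) {
    inputs.emplace_back(std::move(input));
    return inputs.back().get();
}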
@@ -556,6 +514,9 @@ class llm_graph_result : public llm_graph_result_i {
     // we will use this to determine whether the graph can be reused by comparing them with the new parameters
     // note: these are updated after constructing the new graph
     llm_graph_params params;
+
+    // env: LLAMA_GRAPH_RESULT_DEBUG
+    int debug = 0;
 };
 
 //
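The only documentation for the new member is its env comment, so its exact use is not visible in this hunk; presumably it gates extra diagnostics when graphs are built or reused. A hypothetical sketch of a call site (the logging line is illustrative, not from this commit):

// hypothetical: dump the graph size when LLAMA_GRAPH_RESULT_DEBUG is set
if (res->debug > 0) {
    LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, ggml_graph_n_nodes(res->get_gf()));
}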