I have a C++ program that loads a Python module and calls one of its functions to evaluate a Graph Neural Network. The function is called inside an event loop, so it runs once for every event. My problem is that the initialization takes quite long: the first time I evaluate the model in the Python code it takes ~8 seconds, whereas an immediate second evaluation takes only 0.005 seconds, which is great. What can I do to speed up this initialization, given that I need to loop over millions of events?
This is the C++ code:
//arachne
#include "ArachneWupperFlow.hh"
#include "pic.h"
//std/sys
#include <iostream>
#include <stdio.h>
#include <sys/stat.h>
//python
#define PY_SSIZE_T_CLEAN
#include <Python.h>
//root
#include "TSystem.h"
// Default-constructs the evaluator with WupperFlow disabled and every
// configurable quantity set to its "not configured" sentinel.
ArachneWupperFlow::ArachneWupperFlow()
{
  runWupperFlow = false; //flag to run WupperFlow
  runMode = "DNN"; //Possible run-modes are DNN or GNN
  modelFilepath = ""; //path to ONNX model
  num_inputs = -999; //number of input variables
  nnoutbins = 100; //number of nnout-bins
  nnout = -999;    //nnout variable
  nvar = -1;       //nvar
  GNNnvar = -1;    //number of elements in the GNNarray per event
  num_classes = 1;
  name = "";
  train_tree_name = "";
  // evaluate() reads pFunc, which is only assigned by setUpWupperFlow().
  // Give it a defined value so "not set up yet" is detectable instead of
  // being an indeterminate pointer.
  pFunc = NULL;
}
// Destructor: performs no clean-up of its own.
// NOTE(review): the Python callable stored by setUpWupperFlow() is not
// released here and the interpreter is never finalised -- confirm that this
// is intended for the lifetime of the process.
ArachneWupperFlow::~ArachneWupperFlow() {}
void ArachneWupperFlow::setUpWupperFlow()
{
  if (runMode == "GNN") {
    std::cout<<"Running in mode GNN. Preparing everything for interference of the GNN model."<<std::endl;
    //Print Logo
    print_WupperFlow("GNN Evaluator");
    //Initialize the python objects
    PyObject *pName, *pModule;//, *pFunc;
    PyObject *pArgs, *pValue;
    PyObject* GNNList = PyList_New(0);
    //Initialize the interpreter
    Py_Initialize();
    //Load the python module
    pName = PyUnicode_DecodeFSDefault("scripts.python.WupperFlowEvaluator");
    //Import the function from the python module
    pModule = PyImport_Import(pName);
    Py_DECREF(pName);
    if (pModule != NULL) {
      //Obtain the function from the python module
      //pFunc is a new reference
      pFunc = PyObject_GetAttrString(pModule, "main");
      if (pFunc && PyCallable_Check(pFunc)) {
        //Retrieve the arguments to be passed to the function
        pArgs = PyTuple_New(2);
        //Convert the C++ std::vector with the GNN input-vars to a python list
        int size = GNNnvar;
        if (size == -1) {
          std::cout<<"Please provide the number of elements in the GNNarray with the GNNnvar parameter!"<<std::endl;
        } else {
          //Fill the python list with the GNN input-vars
          for (int i=0; i<size; i++) {
            PyList_Append(GNNList, PyFloat_FromDouble(2.0));
          }
        }
        //Fill the arguments for the python function, GNN input-list and event number
        PyTuple_SetItem(pArgs, 0, GNNList);
        PyTuple_SetItem(pArgs, 1, PyLong_FromLong(345654323));
        //Call the function and get the output
        pValue = PyObject_CallObject(pFunc, pArgs);
        Py_DECREF(pArgs);
        Py_DECREF(GNNList);
        if (pValue != NULL) {
          //Get the output score of the GNN interference
          nnout = PyFloat_AsDouble(pValue);
          Py_DECREF(pValue);
        } else {
          Py_DECREF(pFunc);
          Py_DECREF(pModule);
          PyErr_Print();
          fprintf(stderr,"Call failed\n");
        }
      }
      else {
        if (PyErr_Occurred()) {
          PyErr_Print();
        }
        fprintf(stderr, "Cannot find function \"%s\"\n", "main");
      }
      // Py_XDECREF(pFunc);
      Py_DECREF(pModule);
    } else {
      PyErr_Print();
      fprintf(stderr, "Failed to load \"%s\"\n", "scripts.python.WupperFlowEvaluator");
    }
  }
}
double ArachneWupperFlow::evaluate(float* inputarray, ULong_t event_number)
{
  if (runMode == "DNN") {
    
  } else if (runMode == "GNN") {
    PyObject *pArgs, *pValue;
    PyObject* GNNList = PyList_New(0);
    // Retrieve the arguments to be passed to the function
    pArgs = PyTuple_New(2);
    //Convert the C++ std::vector with the GNN input-vars to a python list
    int size = GNNnvar;
    if (size == -1) {
      std::cout<<"Please provide the number of elements in the GNNarray with the GNNnvar parameter!"<<std::endl;
    } else {
      //Fill the python list with the GNN input-vars
      for (int i=0; i<size; i++) {
        PyList_Append(GNNList, PyFloat_FromDouble(inputarray[i]));
      }
    }
    //Fill the arguments for the python function, GNN input-list and event number
    PyTuple_SetItem(pArgs, 0, GNNList);
    PyTuple_SetItem(pArgs, 1, PyLong_FromLong(event_number));
    //Call the function and get the output
    pValue = PyObject_CallObject(pFunc, pArgs);
    Py_DECREF(pArgs);
    Py_DECREF(GNNList);
    if (pValue != NULL) {
      //Get the output score of the GNN interference
      nnout = PyFloat_AsDouble(pValue);
      Py_DECREF(pValue);
    } else {
      // Py_DECREF(pFunc);
      // Py_DECREF(pModule);
      PyErr_Print();
      fprintf(stderr,"Call failed\n");
    }
    
  }
  //Return the output score
  std::cout<<nnout<<std::endl;
  return nnout;
}
And this is the Python part:
import os
import numpy as np
import pandas as pd
import sonnet as snt
import tensorflow as tf
from graph_nets import utils_tf
from graph_nets import graphs
from graph_nets import modules
def calc_dphi_array(phi1,phi2):
  # Body omitted in this excerpt.
  # NOTE(review): judging by the name only, this presumably returns the
  # azimuthal-angle difference between phi1 and phi2 -- confirm against the
  # full source.
  ...
def make_graph(event):
  # Body omitted in this excerpt.
  # NOTE(review): presumably converts one event into the graph structure that
  # is fed to the model -- confirm against the full source.
  ...
class MyMLP(snt.Module):
  """MLP followed by layer normalisation.

  Args:
    latent_size: width of every layer of the MLP.
    num_layers: number of layers of the MLP.
    dropout: dropout rate passed to snt.nets.MLP; 0 disables the dropout path.
    activation: activation function used by the MLP.
  """

  # No @tf.function on the constructor: it only creates Python objects and
  # stores attributes. Tracing it as a graph makes those assignments
  # trace-time side effects and adds tracing cost to model construction.
  def __init__(self,latent_size,num_layers,dropout,activation):
    super(MyMLP, self).__init__(name=None)
    self.mlp = snt.nets.MLP([latent_size] * num_layers, activate_final=True, dropout_rate=dropout, w_init = None, b_init = None, activation = activation)
    self.ln = snt.LayerNorm(axis=-1, create_scale=True, create_offset=False)
    # Decides in __call__ whether the MLP is given the is_training argument.
    self.use_dropout = (dropout != 0)

  @tf.function()
  def __call__(self, inputs):
    """Applies the MLP (with is_training=False when dropout is configured), then LayerNorm."""
    if self.use_dropout:
      outputs = self.mlp(inputs, is_training=False)
    else:
      outputs = self.mlp(inputs)
    outputs = self.ln(outputs)
    return outputs
class OutputMLP(snt.Module):
  """Two-layer MLP whose output is passed through a sigmoid.

  Args:
    global_output_size: width of the final layer.
    latent_size: width of the hidden layer.
    dropout: dropout rate passed to snt.nets.MLP; 0 disables the dropout path.
    activation: activation function used by the MLP.
  """

  # No @tf.function on the constructor: it only creates Python objects and
  # stores attributes. Tracing it as a graph makes those assignments
  # trace-time side effects and adds tracing cost to model construction.
  def __init__(self, global_output_size = 1, latent_size=64, dropout=0.05, activation=tf.nn.leaky_relu):
    super(OutputMLP, self).__init__(name=None)
    self.mlp = snt.nets.MLP([latent_size, global_output_size],
                            name='global_output', dropout_rate = dropout, w_init = None, b_init = None, activation = activation)
    # Decides in __call__ whether the MLP is given the is_training argument.
    self.use_dropout = (dropout != 0)

  @tf.function()
  def __call__(self, inputs):
    """Applies the MLP (with is_training=False when dropout is configured), then a sigmoid."""
    if self.use_dropout:
      outputs = self.mlp(inputs, is_training=False)
    else:
      outputs = self.mlp(inputs)
    outputs = tf.sigmoid(outputs)
    return outputs
 
def make_mlp_model(latent_size=64,num_layers=4,dropout=0.05,activation=tf.nn.leaky_relu):
  """Factory returning a MyMLP built from the given hyper-parameters."""
  mlp_block = MyMLP(latent_size, num_layers, dropout, activation)
  return mlp_block
class MLPGraphIndependent(snt.Module):
  """GraphIndependent with MLP edge, node, and global models."""

  # No @tf.function on the constructor: it only creates the wrapped module.
  # Tracing it as a graph adds tracing cost to model construction.
  def __init__(self):
    super(MLPGraphIndependent, self).__init__(name="MLPGraphIndependent")
    self._network = modules.GraphIndependent(
      edge_model_fn=make_mlp_model,
      node_model_fn=make_mlp_model,
      global_model_fn=make_mlp_model)

  @tf.function()
  def __call__(self, inputs):
    """Passes `inputs` through the wrapped GraphIndependent module and returns its result."""
    return self._network(inputs)
class OutputTransform(snt.Module):
  """GraphIndependent that applies OutputMLP as the global model; no edge or node model is set."""

  # No @tf.function on the constructor: it only creates the wrapped module.
  # Tracing it as a graph adds tracing cost to model construction.
  def __init__(self):
    super(OutputTransform, self).__init__(name="OutputTransform")
    self._network = modules.GraphIndependent(
      edge_model_fn = None,
      node_model_fn = None,
      global_model_fn = OutputMLP)

  @tf.function()
  def __call__(self, inputs):
    """Passes `inputs` through the wrapped GraphIndependent module and returns its result."""
    return self._network(inputs)
class MLPGraphNetwork(snt.Module):
  """GraphNetwork with MLP edge, node, and global models."""

  # No @tf.function on the constructor: it only creates the wrapped module.
  # Tracing it as a graph adds tracing cost to model construction.
  def __init__(self):
    super(MLPGraphNetwork, self).__init__(name="MLPGraphNetwork")
    self._network = modules.GraphNetwork(
      edge_model_fn=make_mlp_model,
      node_model_fn=make_mlp_model,
      global_model_fn=make_mlp_model)

  @tf.function()
  def __call__(self, inputs):
    """Passes `inputs` through the wrapped GraphNetwork module and returns its result."""
    return self._network(inputs)
class MLPAttentionNetwork(snt.Module):
  """Wraps modules.SelfAttention, using the graph's nodes as values, keys and queries."""

  # No @tf.function on the constructor: it only creates the wrapped module.
  # Tracing it as a graph adds tracing cost to model construction.
  def __init__(self):
    super(MLPAttentionNetwork, self).__init__(name="MLPAttentionNetwork")
    self._attn = modules.SelfAttention()

  @tf.function()
  def __call__(self, inputs):
    """Calls SelfAttention with `inputs.nodes` for the first three arguments and `inputs` as the fourth."""
    nodes = inputs.nodes
    return self._attn(nodes,nodes,nodes,inputs)
  
class GeneralClassifier(snt.Module):
  """Classifier built from an encoder, a repeatedly applied core and a decoder.

  The decoded graph is passed through OutputTransform to produce the output.
  """

  # No @tf.function on the constructor: it only creates the sub-modules.
  # Tracing it as a graph adds tracing cost to model construction.
  def __init__(self):
    super(GeneralClassifier, self).__init__(name="GeneralClassifier")
    self._encoder = MLPGraphIndependent()
    self._core    = MLPGraphNetwork()
    self._decoder = MLPGraphIndependent()
    # Transforms the outputs into appropriate shapes.
    self._output_transform = OutputTransform()

  @tf.function()
  def __call__(self, input_op, num_processing_steps):
    """Runs the network and returns a one-element list holding the transformed output.

    Args:
      input_op: input graph(s) handed to the encoder.
      num_processing_steps: number of times the core is applied; used in a
        Python `range`, so it is expected to be a Python int.

    NOTE(review): tf.function traces again for each new input signature. If
    the graphs differ in size from event to event, consider giving the
    top-level call an `input_signature` with relaxed shapes -- confirm by
    checking how often tracing happens.
    """
    latent = self._encoder(input_op)
    # The encoder output is concatenated with the current latent in every step.
    latent0 = latent

    output_ops = []
    for _ in range(num_processing_steps):
      core_input = utils_tf.concat([latent0, latent], axis=1)
      latent = self._core(core_input)

    decoded_op = self._decoder(latent)
    output_ops.append(self._output_transform(decoded_op))
    return output_ops
class AttentionClassifier(snt.Module):
  """Classifier built from an encoder, a repeated attention-plus-core step and a decoder.

  The decoded graph is passed through OutputTransform to produce the output.
  """

  # No @tf.function on the constructor: it only creates the sub-modules.
  # Tracing it as a graph adds tracing cost to model construction.
  def __init__(self):
    super(AttentionClassifier, self).__init__(name="AttentionClassifier")
    self._encoder = MLPGraphIndependent()
    self._attn    = MLPAttentionNetwork()
    self._core    = MLPGraphNetwork()
    self._decoder = MLPGraphIndependent()
    # Transforms the outputs into appropriate shapes.
    self._output_transform = OutputTransform()

  @tf.function()
  def __call__(self, input_op, num_processing_steps):
    """Runs the network and returns a one-element list holding the transformed output.

    Args:
      input_op: input graph(s) handed to the encoder.
      num_processing_steps: number of times attention followed by the core is
        applied; used in a Python `range`, so it is expected to be a Python int.

    NOTE(review): tf.function traces again for each new input signature. If
    the graphs differ in size from event to event, consider giving the
    top-level call an `input_signature` with relaxed shapes -- confirm by
    checking how often tracing happens.
    """
    latent = self._encoder(input_op)
    # The encoder output is concatenated with the current latent in every step.
    latent0 = latent

    output_ops = []
    for _ in range(num_processing_steps):
      core_input = utils_tf.concat([latent0, latent], axis=1)
      latent = self._core( self._attn(core_input))

    decoded_op = self._decoder(latent)
    output_ops.append(self._output_transform(decoded_op))
    return output_ops
"""
Main executable for evaluating WupperFlow GNNs in the Arachne event-loop
""" 
def main(input_list, event_number):
  # Entry point looked up by name ("main") and called from the C++ side with
  # the list of GNN input values and the event number.
  # Part of the body is omitted in this excerpt.
  ...
      import time
      # The model is evaluated twice with identical arguments purely to compare
      # timings; only the result of the second call is kept.
      # NOTE(review): presumably the first call pays for tf.function tracing and
      # variable creation, which would explain the reported ~8 s vs ~0.005 s --
      # confirm by profiling.
      start = time.time()
      output = model(input_graphs_ntuple, nprocsteps)
      end = time.time()
      print("Eval time: "+str(end-start))
      start = time.time()
      output = model(input_graphs_ntuple, nprocsteps)
      end = time.time()
      print("Eval time: "+str(end-start))
  # output[0] is the first element of the model's returned list.
  # NOTE(review): index 4 is presumably the `globals` field of the GraphsTuple -- TODO confirm.
  return output[0][4].numpy()[0][0]
I have left out the non-problematic parts. The main problem is that the call `output = model(input_graphs_ntuple, nprocsteps)` takes too long because of the time spent on initialization. I call it twice here because that is how I noticed that the second execution is much faster, but in the end I only want to call it once.
I have tried some optimizations of the TensorFlow code, but they did not help enough; the main problem is the initialization.