Tiramisu Compiler
core.h
Go to the documentation of this file.
1 #ifndef _H_TIRAMISU_CORE_
2 #define _H_TIRAMISU_CORE_
3 
#include <isl/set.h>
#include <isl/map.h>
#include <isl/union_map.h>
#include <isl/union_set.h>
#include <isl/ast_build.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>
#include <isl/space.h>
#include <isl/constraint.h>

#include <map>
#include <memory>
#include <sstream>
#include <stdint.h>
#include <string.h>
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <Halide.h>
#include <tiramisu/debug.h>
#include <tiramisu/expr.h>
#include <tiramisu/type.h>
#include "cuda_ast.h"
26 
27 namespace tiramisu
28 {
29 class view;
30 class input;
31 class function;
32 class computation;
33 class buffer;
34 class constant;
35 class generator;
36 class computation_tester;
37 class send;
38 class recv;
39 class send_recv;
40 class wait;
41 class sync;
42 class xfer_prop;
43 
44 
46 {
47  std::map<std::string, tiramisu::computation *> computation_list;
48  std::map<std::string, tiramisu::constant *> constant_list;
49  std::map<std::string, tiramisu::buffer *> output_buffers;
50 
51  HalideCodegenOutput(const std::map<std::string, tiramisu::computation *> &computations,
52  const std::map<std::string, tiramisu::constant *> &constants,
53  const std::map<std::string, tiramisu::buffer *> &buffers)
54  : computation_list(computations), constant_list(constants), output_buffers(buffers) {}
55 };
56 
58  Halide::Internal::Stmt s,
59  const std::vector<Halide::Internal::Function> &outputs,
60  const std::map<std::string, Halide::Internal::Function> &env,
61  const std::map<std::string, std::vector<int32_t>> &output_buffers_size,
62  tiramisu::function *func);
63 
65 
66 std::string generate_new_computation_name();
67 
68 isl_map *isl_map_add_free_var(const std::string &free_var_name, isl_map *map, isl_ctx *ctx);
69 void split_string(std::string str, std::string delimiter, std::vector<std::string> &vector);
72 
73 enum xfer_attr {
78  MPI,
84 };
85 
86 struct xfer {
90 };
91 
92 /**
93  * Loop level, loop start, loop end, number of elements to collapse
94  */
95 typedef std::tuple<int, tiramisu::expr, tiramisu::expr, tiramisu::expr> collapse_group;
96 
97 
98 //*******************************************************
99 
100 /**
101  * Initialize the Tiramisu compiler and set Tiramisu options to default
102  * values.
103  * \p fct_name is the name of the function to be generated by the Tiramisu compiler.
104  * This is the name that the user will call to execute the generated code.
105  */
106 void init(std::string name);
107 
108 /**
109  * \overload
110  */
111 void init();
112 
113 /**
114  * \brief Generate code.
115  *
116  * \details
117  *
118  * This function generates the declared function and computations in an object file.
119  * \p obj_filename is the relative path of the object file to be generated.
120  * If \p gen_cuda_stmt is set to true, CUDA code is generated instead of code
121  * targeting CPU (i.e., instead of generating Halide IR then LLVM IR).
122  */
123 void codegen(const std::vector<tiramisu::buffer *> &arguments, const std::string obj_filename, const bool gen_cuda_stmt = false);
124 
125 //*******************************************************
126 
127 /**
128  * A class to represent functions in Tiramisu. A function in Tiramisu is composed of
129  * a set of computations (tiramisu::computation).
130  */
131 class function
132 {
133  // Friend classes. They can access the private members of the "function" class.
134  friend buffer;
135  friend computation;
136  friend constant;
137  friend generator;
138  friend tiramisu::wait;
139  friend cuda_ast::generator;
140 
141 private:
142  /**
143  * The name of the function.
144  * Function names should not start with _ (an underscore).
145  * Names starting with _ are reserved names.
146  */
147  std::string name;
148 
149  /**
150  * True if this requires a call to MPI_Comm_rank or something similar.
151  */
152  bool _needs_rank_call;
153 
154  /**
155  * Offset the rank by this amount.
156  */
157  int rank_offset = 0;
158 
159  /**
160  * Function arguments. These are the buffers or scalars that are
161  * passed to the function.
162  */
163  std::vector<tiramisu::buffer *> function_arguments;
164 
165  /**
166  * A vector representing the invariants of the function (symbolic
167  * constants or variables that are invariant to the function i.e.
168  * do not change their value during the execution of the function).
169  */
170  std::vector<tiramisu::constant> invariants;
171 
172  /**
173  * An ISL context associate with the function.
174  */
175  isl_ctx *ctx;
176 
177  /**
178  * An ISL AST representation of the function.
179  * The ISL AST is generated by calling gen_isl_ast().
180  */
181  isl_ast_node *ast;
182 
183  /**
184  * A vector representing the parallel dimensions around
185  * the computations of the function.
186  * A parallel dimension is identified using the pair
187  * <computation_name, level>, for example the pair
188  * <S0, 0> indicates that the loop with level 0
189  * (i.e. the outermost loop) around the computation S0
190  * should be parallelized.
191  */
192  std::vector<std::pair<std::string, int>> parallel_dimensions;
193 
194  /**
195  * A vector representing the vectorized dimensions around
196  * the computations of the function.
197  * A vector dimension is identified using the tuple
198  * <computation_name, level, length>, for example the tuple
199  * <S0, 0, 4> indicates that the loop with level 0
200  * (i.e. the outermost loop) around the computation S0
201  * should be vectorized with a vector length 4.
202  */
203  std::vector<std::tuple<std::string, int, int>> vector_dimensions;
204 
205  /**
206  * A vector representing the distributed dimensions around
207  * the computations of the function.
208  * A distributed dimension is identified using the pair
209  * <computation_name, level>, for example the pair
210  * <S0, 0> indicates that the loop with level 0
211  * (i.e. the outermost loop) around the computation S0
212  * should be distributed.
213  */
214  std::vector<std::pair<std::string, int>> distributed_dimensions;
215 
216  /**
217  * A vector representing the GPU block dimensions around
218  * the computations of the function.
219  * GPU dimensions dimensions are dimensions that should be mapped
220  * to parallel GPU dimensions.
221  * GPU dimensions are identified using the tuple
222  * <computation_name, level0, level1, level2>, for example the tuple
223  * <S0, 0, 1, 2> indicates that the loops with levels 0, 1 and 2
224  * (i.e. the three outermost loops) around the computation S0
225  * should be mapped to GPU dimensions.
226  * Level1 must be the level following level0, i.e.
227  * level1 == level0 + 1
228  * and level2 must be the level following level1
229  */
230  std::vector<std::pair<std::string, std::tuple<int, int, int>>> gpu_block_dimensions;
231 
232  /**
233  * Similar to gpu_dimensions but used to store information about GPU thread dimensions.
234  * i.e., dimensions that should mapped to threads (in CUDA terminology).
235  */
236  std::vector<std::pair<std::string, std::tuple<int, int, int>>> gpu_thread_dimensions;
237 
238  /**
239  * A vector representing the dimensions that should be unrolled
240  * around the computations of the function.
241  * Unrolled dimensions are identified using the tuple
242  * <computation_name, level0>, for example the tuple
243  * <S0, 1> indicates that the loops with level 1
244  * around the computation S0 should be unrolled.
245  */
246  std::vector<std::pair<std::string, int>> unroll_dimensions;
247 
248  /**
249  * Body of the function (a vector of computations).
250  * The order of the computations in the vector does not have any
251  * effect on the actual order of execution of the computations.
252  * The order of execution of computations is specified through the
253  * schedule.
254  */
255  std::vector<computation *> body;
256 
257  /**
258  * A Halide statement that represents the whole function.
259  * This value stored in halide_stmt is generated by the code generator
260  * and is empty before calling the code generator.
261  */
262  Halide::Internal::Stmt halide_stmt;
263 
264  /**
265  * A map representing the buffers of the function. Some of these
266  * buffers are passed to the function as arguments and some are
267  * declared and allocated within the function itself.
268  */
269  std::map<std::string, tiramisu::buffer *> buffers_list;
270 
271  /**
272  * The context set of the function, i.e. a set representing the
273  * constraints over the parameters.
274  * The parameters of a function are the function invariants (constants).
275  */
276  isl_set *context_set;
277 
278  /**
279  * The names of the iterators.
280  */
281  std::vector<std::string> iterator_names;
282 
283  /**
284  * Map from loop iterators to CUDA kernels.
285  */
286  std::unordered_map<isl_ast_node*, cuda_ast::kernel_ptr> iterator_to_kernel_map;
287 
288  std::shared_ptr<cuda_ast::compiler> nvcc_compiler;
289 
290  /**
291  * Tag the dimension \p dim of the computation \p computation_name to
292  * be parallelized.
293  * The dimension 0 represents the outermost loop level (it
294  * corresponds to the leftmost dimension in the iteration space).
295  */
296  void add_parallel_dimension(std::string computation_name, int vec_dim);
297 
298  /**
299  * Tag the dimension \p dim of the computation \p computation_name to
300  * be vectorized. \p len is the vector length.
301  * The dimension 0 represents the outermost loop level (it
302  * corresponds to the leftmost dimension in the iteration space).
303  */
304  void add_vector_dimension(std::string computation_name, int vec_dim, int len);
305 
306  /**
307  * Tag the dimension \p dim of the computation \p computation_name to
308  * be distributed.
309  * The dimension 0 represents the outermost loop level (it
310  * corresponds to the leftmost dimension in the iteration space).
311  */
312  void add_distributed_dimension(std::string computation_name, int dim);
313 
314  /**
315  * Tag the loop level \p L of the computation
316  * \p computation_name to be unrolled.
317  * The dimension 0 represents the outermost loop level (it
318  * corresponds to the leftmost dimension in the iteration space).
319  */
320  void add_unroll_dimension(std::string stmt_name, int L);
321 
322  /**
323  * Get live in/out computations in the function.
324  */
325  // @{
326  std::vector<tiramisu::computation *> get_live_in_computations();
327  std::vector<tiramisu::computation *> get_live_out_computations();
328  // @}
329 
330  /**
331  * Recursive function to perform the DFS step of dump_sched_graph.
332  */
333  void dump_sched_graph_dfs(tiramisu::computation *,
334  std::unordered_set<tiramisu::computation *> &);
335 
336  /**
337  * Recursive function to perform the DFS step of is_sched_graph_tree.
338  */
339  bool is_sched_graph_tree_dfs(tiramisu::computation *,
340  std::unordered_set<tiramisu::computation *> &);
341  /**
342  * This functions iterates over the iteration domain of the computations
343  * of the function and computes the maximum dimension among the dimensions
344  * of these iteration domains.
345  */
346  int get_max_iteration_domains_dim() const;
347 
348 
349  /**
350  * This functions iterates over the schedules of the function (the schedule
351  * of each computation in the function) and computes the maximal dimension
352  * among the dimensions of the ranges of all the schedules.
353  */
354  int get_max_schedules_range_dim() const;
355 
356  /**
357  * This function first computes the identity schedules,
358  * then it computes the maximal dimension among the dimensions
359  * of the ranges of all the identity schedules.
360  */
361  int get_max_identity_schedules_range_dim() const;
362 
363  /**
364  * Get the trimmed time-processor domain of the function.
365  * The first dimension of the time-processor domain is used
366  * to indicate duplicates of the computation. Computations that
367  * do not have any duplicate have 0 in that dimension, whereas
368  * computations that have duplicates (i.e., are recomputed) have
369  * a number in that dimension to represent each duplicate.
370  * The trimmed time-processor domain is the time-processor domain
371  * without the dimension that represents the duplicates. We simply
372  * take the time-processor domain and remove the first dimension
373  * used to represent the duplicates.
374  */
375  isl_union_set *get_trimmed_time_processor_domain() const;
376 
377  /**
378  * This function iterates over the computations of the function.
379  * It modifies the identity schedule of each computation in order to
380  * make all the identity schedules have the same number of dimensions
381  * in their ranges.
382  * This is done by adding dimensions equal to 0 to the range of each
383  * identity schedule that does not have enough dimensions.
384  */
385  isl_union_map *get_aligned_identity_schedules() const;
386 
387  /**
388  * A pass to rename computations.
389  * Computation that are defined multiple times need to be renamed, because
390  * those computations in general have different expressions and the code
391  * generator expects that computations that have the same name always have
392  * the same expression and access relation. So we should rename them to avoid
393  * any ambiguity for the code generator.
394  *
395  */
396  void rename_computations();
397 
398 
399 protected:
400 
401  /**
402  * Add a buffer to the function.
403  * The buffers of the function are either:
404  * - buffers passed to the function as arguments, or
405  * - buffers that are declared and allocated within the function
406  * itself.
407  * The first element of the pair is the name of the buffer (it is
408  * used as a key), the second element of the pair is a pointer
409  * to the buffer.
410  */
411  void add_buffer(std::pair<std::string, tiramisu::buffer *> buf);
412 
413  /**
414  * Add a computation to the function. The order in which
415  * computations are added to the function is not important.
416  * The order of execution is specified using the schedule.
417  * This doesn't allow computations with duplicate names.
418  */
419  void add_computation(computation *cpt);
420 
421  /**
422  * Tag the dimensions \p dim0, \p dim1 and \p dim2 of the computation
423  * \p computation_name to be mapped to GPU blocks.
424  * The dimension 0 represents the outermost loop level (it
425  * corresponds to the leftmost dimension in the iteration space).
426  *
427  * If the user does not want to tag \p dim1 or \p dim2, he can leave
428  * their values to default value (i.e., -1). They will not be tagged.
429  *
430  * For example
431  *
432  * add_gpu_block_dimensions("S0", 1, 2);
433  *
434  * Will tag the dimensions 1 and 2 to be transformed to GPU blocks.
435  */
436  void add_gpu_block_dimensions(std::string stmt_name, int dim0, int dim1 = -1, int dim2 = -1);
437 
438  /**
439  * Tag the dimensions \p dim0, \p dim1 and \p dim2 of the computation
440  * \p computation_name to be mapped to GPU threads.
441  * The dimension 0 represents the outermost loop level (it
442  * corresponds to the leftmost dimension in the iteration space).
443  *
444  * If the user does not want to tag \p dim1 or \p dim2, he can leave
445  * their values to default value (i.e., -1). They will not be tagged.
446  *
447  * For example
448  *
449  * add_gpu_block_dimensions("S0", 1, -1, -1);
450  *
451  * Will tag the dimension 1 to be transformed to GPU threads.
452  */
453  void add_gpu_thread_dimensions(std::string stmt_name, int dim0, int dim1 = -1, int dim2 = -1);
454 
455  /**
456  * Add an invariant to the function.
457  */
458  void add_invariant(tiramisu::constant param);
459 
460  /**
461  * Add an iterator to the function.
462  */
463  void add_iterator_name(const std::string &it_name);
464 
465  /**
466  * Keeps track of allocation computations created using
467  * allocate_and_map_buffer_automatically() to schedule them during gen_ordering_schedules.
468  */
469  std::vector<tiramisu::computation *> automatically_allocated;
470 
471  /**
472  * Compute the graph of dependences between the computations of
473  * the function.
474  *
475  * Example
476  *
477  * C[0] = 0
478  * D[1] = C[0]
479  * D[2] = C[0]
480  * {C[0] -> D[1]; C[0]->D[2]}
481  */
482  isl_union_map *compute_dep_graph();
483 
484  /**
485  * Get the arguments of the function.
486  */
487  const std::vector<tiramisu::buffer *> &get_arguments() const;
488 
489  /**
490  * Return a map that represents the buffers of the function.
491  * The buffers of the function are buffers that are either passed
492  * to the function as arguments or are buffers that are declared
493  * and allocated within the function itself.
494  * The names of the buffers are used as a key for the map.
495  */
496  const std::map<std::string, tiramisu::buffer *> &get_buffers() const;
497 
498  /**
499  * Return a vector of the computations of the function.
500  * The order of the computations in the vector does not have any
501  * effect on the actual order of execution of the computations.
502  * The order of execution of computations is specified through the
503  * schedule.
504  */
505  const std::vector<computation *> &get_computations() const;
506 
507  /**
508  * Return the computation of the function that has
509  * the name \p str.
510  */
511  std::vector<computation *> get_computation_by_name(std::string str) const;
512 
513  /**
514  * Return a string representing the name of the GPU block iterator at
515  * dimension \p lev0.
516  * This function only returns a non-empty string if the
517  * computation \p comp is mapped to a GPU block at the dimension \p lev0.
518  */
519  std::string get_gpu_block_iterator(const std::string &comp, int lev0) const;
520 
521  /**
522  * Return a string representing the name of the GPU thread iterator at
523  * dimension \p lev0.
524  * This function only returns a non-empty string if the
525  * computation \p comp is mapped to a GPU thread at the dimension \p lev0.
526  */
527  std::string get_gpu_thread_iterator(const std::string &comp, int lev0) const;
528 
529  /**
530  * Return the isl_ctx associated with this function.
531  * This is an ISL specific object required when calling certain
532  * ISL functions. It does not represent the set of parameters
533  * of the function (which should be retrieved by calling
534  * get_program_context()).
535  */
536  isl_ctx *get_isl_ctx() const;
537 
538  /**
539  * Return the Halide statement that represents the whole
540  * function.
541  * The Halide statement is generated by the code generator.
542  * This function should not be called before calling the code
543  * generator.
544  */
545  Halide::Internal::Stmt get_halide_stmt() const;
546 
547  /**
548  * Return a vector representing the invariants of the function
549  * (symbolic constants or variables that are invariant to the
550  * function i.e. do not change their value during the execution
551  * of the function).
552  *
553  * get_invariant_names() returns the names of the variants.
554  */
555  //@{
556  const std::vector<tiramisu::constant> &get_invariants() const;
557  const std::vector<std::string> get_invariant_names() const;
558  //@}
559 
560  /**
561  * Return an ISL AST that represents this function.
562  * This function itself does not generate the ISL AST, it just
563  * returns it if it already exists.
564  * The function gen_isl_ast() should be called before calling
565  * this function.
566  */
567  isl_ast_node *get_isl_ast() const;
568 
569  /**
570  * Return the union of all the iteration domains
571  * of the computations of the function.
572  */
573  isl_union_set *get_iteration_domain() const;
574 
575  /**
576  * Get the iterator names of the function.
577  */
578  const std::vector<std::string> &get_iterator_names() const;
579 
580  /**
581  * Get the name of the function.
582  */
583  const std::string &get_name() const;
584 
585  /**
586  * Return a set that represents the parameters of the function
587  * (an ISL set that represents the parameters and constraints over
588  * the parameters of the functions, a parameter is an invariant
589  * of the function). This set is also known as the context of
590  * the program.
591  * An example of a context set is the following:
592  * "[N,M]->{: M>0 and N>0}"
593  * This context set indicates that the two parameters N and M
594  * are strictly positive.
595  */
596  isl_set *get_program_context() const;
597 
598  /**
599  * Return the union of all the schedules
600  * of the computations of the function.
601  */
602  isl_union_map *get_schedule() const;
603 
604  /**
605  * Return the union of all the trimmed schedules of the
606  * function.
607  * A trimmed schedule is the schedule without the duplication
608  * dimension (the schedule dimension used to indicate duplicate
609  * computations).
610  */
611  isl_union_map *get_trimmed_schedule() const;
612 
613  /**
614  * Return the union of time-processor domains of each
615  * computation in the function.
616  * In the time-processor representation, the logical time of
617  * execution and the processor where the computation will be
618  * executed are both specified.
619  */
620  isl_union_set *get_time_processor_domain() const;
621 
622  /**
623  * If the computation \p comp is vectorized, return its vector length
624  * at the loop level \p lev.
625  */
626  int get_vector_length(const std::string &comp, int lev) const;
627 
628  /**
629  * Return true if the usage of high level scheduling comments is valid; i.e. if
630  * the scheduling relations formed using before, after, compute_at, etc.. form a tree.
631  *
632  * More specifically, it verifies that:
633  * - There should be exactly one computation with no computation scheduled before it.
634  * - Each other computation should have exactly one computation scheduled before it.
635  */
636  bool is_sched_graph_tree();
637 
638  /**
639  * Modify the schedules of the computations of this function to reflect
640  * the order specified using the high level scheduling commands.
641  *
642  * Commands like .after() and .before() do not directly modify the schedules
643  * but rather modify the sched_graph graph.
644  */
645  void gen_ordering_schedules();
646 
647  /**
648  * Stores all high level scheduling instructions between computations; i.e. if a user calls
649  * for example c2.after(c1, L), sched_graph[&c1] would contain the key &c2, and
650  * sched_graph[&c1][&c2] = L.
651  * At the end of scheduling, the graph should respect the following rules:
652  * - There should be exactly one computation with no computation scheduled before it.
653  * - Each other computation should have exactly one computation scheduled before it.
654  * In other words, the graph should be a valid tree.
655  * Does not include allocation computations created using
656  * allocate_and_map_buffer_automatically().
657  */
658  std::unordered_map<tiramisu::computation *,
659  std::unordered_map<tiramisu::computation *, int>> sched_graph;
660 
661  /**
662  * Same as sched_graph, except in reverse order (from after to before).
663  */
664  std::unordered_map<tiramisu::computation *,
665  std::unordered_map<tiramisu::computation *, int>> sched_graph_reversed;
666 
667  /**
668  * Set the iterator names of the function.
669  * This function overrides any previously set iterator names.
670  */
671  void set_iterator_names(const std::vector<std::string> &it_names);
672 
673  /**
674  * Return true if the computation \p comp should be mapped to GPU block
675  * at the loop levels \p lev0.
676  */
677  bool should_map_to_gpu_block(const std::string &comp, int lev0) const;
678 
679  /**
680  * Return true if the computation \p comp should be mapped to GPU thread
681  * at the loop levels \p lev0.
682  */
683  bool should_map_to_gpu_thread(const std::string &comp, int lev0) const;
684 
685  /**
686  * Return true if the computation \p comp should be parallelized
687  * at the loop level \p lev.
688  */
689  bool should_parallelize(const std::string &comp, int lev) const;
690 
691  /**
692  * Return true if the computation \p comp should be unrolled
693  * at the loop level \p lev.
694  */
695  bool should_unroll(const std::string &comp, int lev) const;
696 
697  /**
698  * Return true if the computation \p comp should be vectorized
699  * at the loop level \p lev.
700  */
701  bool should_vectorize(const std::string &comp, int lev) const;
702 
703  /**
704  * Return true if the computation \p comp should be distributed
705  * at the loop level \p lev.
706  */
707  bool should_distribute(const std::string &comp, int lev) const;
708 
709  /**
710  * This computation requires a call to the MPI_Comm_rank function.
711  */
712  bool needs_rank_call() const;
713 
714  /**
715  * Lift certain computations for distributed execution to function calls.
716  */
717  void lift_mpi_comp(tiramisu::computation *comp);
718 
719  /**
720  * Lift certain computations for distributed execution to function calls.
721  */
722  void lift_dist_comps();
723 
724  /**
725  * The set of all computations that have no computation scheduled before them.
726  * Does not include allocation computations created using
727  * allocate_and_map_buffer_automatically().
728  */
729  std::unordered_set<tiramisu::computation *> starting_computations;
730 
731  /**
732  * A boolean set to true if low level scheduling was used in the program.
733  * If it is used, then high level scheduling commands such as .before(),
734  * .after(), ...
735  */
737 
738 public:
739 
740  /**
741  * \brief Construct a function called \p name.
742  * \details Function names should not start with _ (an underscore).
743  * Names starting with _ are reserved names.
744  */
745  function(std::string name);
746 
747  /**
748  * \brief Add a set of constraints to the context of the program.
749  *
750  * \details This command is useful for providing constraints over the constants
751  * used within a tiramisu function.
752  * The context of a function is represented as an ISL set that represents
753  * constraints over the parameters of the function (a parameter of a
754  * function is a constant used in that function).
755  *
756  * For example, if the constants M and N are known to be positive, it is beneficial
757  * to provide such an information to the Tiramisu compiler as follows:
758  * f.add_context_constraints("[N,M]->{: M>0 and N>0}");
759  * This context set indicates that the two parameters N and M
760  * are strictly positive in the function f.
761  *
762  * \p new_context should have the same space as the context set.
763  * This call intersects the set \p new_context
764  * (input) with the context of the function.
765  */
766  void add_context_constraints(const std::string &new_context);
767 
768  /**
769  * \brief Align the schedules of all the computations of this
770  * function.
771  *
772  * This method applies to the schedule of each computation
773  * in the function. It makes the dimensions of the ranges of
774  * all the schedules equal. This is done by adding dimensions
775  * equal to 0 to the range of each schedule.
776  * This function is called automatically when gen_isl_ast()
777  * or gen_time_processor_domain() are called.
778  */
779  void align_schedules();
780 
781  /**
782  * \brief For each computation, allocate a buffer and map the computation
783  * to that buffer.
784  *
785  * \details For each computation in the function:
786  * - Allocate a buffer where the size of the buffer is derived automatically.
787  * Assuming the name of the computation is C, the name of the generated buffer
788  * is _C_buffer.
789  * - Map the computation to the allocated buffer (one-to-one mapping).
790  * For more details about one-to-one mapping, see computation::store_in.
791  */
792  void allocate_and_map_buffers_automatically();
793 
794  /**
795  * \brief Compute the bounds of each computation.
796  *
797  * \details Computing the bounds of each computation means computing
798  * the constraints over the iteration domains of each computation in
799  * the function.
800  *
801  * In order to deduce bounds, Tiramisu first identifies the final consumers
802  * in the function (i.e., computations that do not have any consumer).
803  * Then, it propagates the bounds over the final consumers to their producers.
804  * The bounds of each consumer are used to deduce the bounds over its producer.
805  *
806  * To take benefit of bound inference, the user can declare computations
807  * without providing constraints on their iteration domains. For example
808  * the user can declare the following computations (the left side is the
809  * iteration domain, while the right side is the expression attached to
810  * each computation)
811  *
812  * \code
813  * {A[i] } : 0
814  * {B[i] } : 0
815  * {C[i] } : A[i] + B[i]
816  * {D[i]: 0<=i<N} : 2*C[i]
817  * \endcode
818  *
819  *
820  * The user needs only to provide constraints over the domains of the
821  * last computations (last consumers), and Tiramisu will propagate
822  * these constraints to all the chain of computations that produce for
823  * those consumers.
824  * In the previous example, constraints over the iteration domain were
825  * only provided for the last consumer "D[i]" and no constraints were
826  * provided for the other computations. Bound inference would deduce
827  * the constraints for the computations A[i], B[i] and C[i].
828  *
829  * Note that bound inference is not possible if you have multiple definitions
830  * of the same computation. For example, if you have multiple definitions
831  * of the same computations, in such a case the user should provide
832  * constraints of the iteration domain of the computation. Example:
833  *
834  * \code
835  * {A[i] } : 0
836  * {C[i]: i=0 } : 0
837  * {C[i]: 1<=i<N} : C[i-1] + A[i]
838  * {D[i]: 0<=i<N} : 2*C[i]
839  * \endcode
840  *
841  * In this case, constraints over the computations defining C[i] should be
842  * provided.
843  */
844  void compute_bounds();
845 
846  /**
847  * \brief Dump the function on standard output (dump most of the fields of
848  * tiramisu::function).
849  * \details This is mainly useful for debugging.
850  * If \p exhaustive is set to true, all the fields of the function
851  * class are printed.
852  */
853  void dump(bool exhaustive) const;
854 
855  /**
856  * \brief Dump the graph of dependences between computations.
857  * \details The graph of dependences is a union of maps (relations) from producers
858  * to consumers.
859  */
860  void dump_dep_graph();
861 
862  /**
863  * \brief Dump a Halide stmt that represents the function.
864  * \details tiramisu::function::gen_halide_stmt should be called before calling
865  * this function.
866  */
867  void dump_halide_stmt() const;
868 
869  /**
870  * \brief Dump the iteration domain of the function.
871  * \details This is mainly useful for debugging.
872  */
873  void dump_iteration_domain() const;
874 
875  /**
876  * \brief Dump the schedules of the computations of the function.
877  * \details This function is mainly useful for debugging.
878  * See tiramisu::computations::set_low_level_schedule for details about the
879  * schedule.
880  */
881  void dump_schedule() const;
882 
883  /**
884  * \brief Dumps the graph of scheduling relations set by the higher level scheduling
885  * functions (e.g. after, before, compute_at...).
886  * \details This is mainly useful for debugging.
887  * This function can be called at any point during scheduling.
888  */
889  void dump_sched_graph();
890 
891  /**
892  * \brief Dump (on stdout) the time processor domain of the function.
893  * \details The time-processor domain should be generated using
894  * tiramisu::function::gen_time_processor_domain before calling this
895  * function.
896  * This is mainly useful for debugging.
897  */
898  void dump_time_processor_domain() const;
899 
900  /**
901  * \brief Dump (on stdout) the trimmed time processor domain of the function.
902  * \details The time-processor domain should be generated using
903  * tiramisu::function::gen_time_processor_domain before calling this function.
904  * This is mainly useful for debugging.
905  * The difference between the time-processor domain and the trimmed
906  * time-processor domain is that the trimmed one does not have the
907  * duplicate dimension. We remove it before printing.
908  * The trimmed time-processor domain is the domain used for code
909  * generation.
910  */
911  void dump_trimmed_time_processor_domain() const;
912 
913  /**
914  * \brief Generate C code and print it on stdout.
915  * \details Currently C code generation is very basic and does not
916  * support many features compared to the Halide code generator.
917  * Use this for debugging only.
918  */
919  void gen_c_code() const;
920 
921  /**
922  * \brief Generate an object file that contains the compiled function.
923  * \details This function relies on Halide to generate the object file.
924  *
925  * \p obj_file_name indicates the name of the generated file.
926  *
927  * \p os indicates the target operating system (Halide::Target::OS).
928  *
929  * \p arch indicates the architecture of the target (the instruction set).
930  *
931  * \p bits indicates the bit-width of the target machine.
932  * It must be 0 for unknown, or 32 or 64.
933  * For a full list of supported values for \p os and \p arch please
934  * check the documentation of Halide::Target
935  * (http://halide-lang.org/docs/struct_halide_1_1_target.html).
936  * If the machine parameters are not supplied, Halide detects
937  * the parameters of the host machine automatically.
938 
939  */
940  void gen_halide_obj(const std::string &obj_file_name, Halide::Target::OS os,
941  Halide::Target::Arch arch, int bits) const;
942 
943  /**
944  * \overload
945  */
946  void gen_halide_obj(const std::string &obj_file_name) const;
947 
948  /**
949  * Generate a Halide stmt that represents the function.
950  */
951  void gen_halide_stmt();
952 
 /**
  * \brief Generate CUDA statements for the function.
  * \details NOTE(review): undocumented in the original; inferred from the
  * name and this header's cuda_ast.h dependency — confirm against cuda_ast.h.
  */
953  void gen_cuda_stmt();
954 
955  /**
956  * Generate an isl AST that represents the function.
957  */
958  void gen_isl_ast();
959 
960  /**
961  * Generate the time-space domain of the function.
962  *
963  * In this representation, the logical time of execution and the
964  * processor where the computation will be executed are both
965  * specified.
966  */
967  void gen_time_space_domain();
968 
969  /**
970  * Set the arguments of the function.
971  * The arguments of the function are provided as a vector of
972  * pointers to buffers. Each buffer represents an argument
973  * to the function.
974  * During code generation, the arguments in the vector will become
975  * the arguments of the generated function (with the order of their
976  * appearance in the vector).
977  */
978  void set_arguments(const std::vector<tiramisu::buffer *> &buffer_vec);
979 
980  /**
981  * Wrapper for all the functions required to run code generation of a
982  * tiramisu program.
983  */
984  void codegen(const std::vector<tiramisu::buffer *> &arguments, const std::string obj_filename, const bool gen_cuda_stmt = false);
985 
986  /**
987  * \brief Set the context of the function.
988  * \details A context is an ISL set that represents constraints over the
989  * parameters of the functions (a parameter is an invariant variable for
990  * the function).
991  * An example of a context set is the following:
992  * "[N,M]->{: M>0 and N>0}"
993  * This context set indicates that the two parameters N and M
994  * are strictly positive.
995  *
996  * This function takes a string that represents an ISL set.
997  */
998  void set_context_set(const std::string &context);
999 
1000  /**
1001  * \overload
1002  *
1003  * This function takes an ISL set as input.
1004  */
1005  void set_context_set(isl_set *context);
1006 
1007 };
1008 
1009 
1010 /**
1011  * A class that represents buffers.
1012  *
1013  * Buffers have two use cases:
1014  * - used to store the results of computations, and
1015  * - used to represent input arguments to functions.
1016  */
1017 class buffer
1018 {
1019  friend computation;
1020  friend function;
1021  friend generator;
1022  friend cuda_ast::generator;
1023 
1024 private:
1025  /**
1026  * A boolean that indicates whether a buffer is allocated or not.
1027  */
1028  bool allocated;
1029 
1030  /**
1031  * Type of the argument (if the buffer is an argument):
1032  * Three possible types:
1033  * - a_input: for inputs of the function,
1034  * - a_output: for outputs of the function,
1035  * - a_temporary: for buffers used as temporary buffers within
1036  * the function (any temporary buffer is allocated automatically by
1037  * the Tiramisu runtime at the entry of the function and is
1038  * deallocated at the exit of the function).
1039  */
1040  tiramisu::argument_t argtype;
1041 
1042  /**
1043  * A boolean indicating whether the buffer should be allocated
1044  * automatically by Tiramisu.
1045  */
1046  bool auto_allocate;
1047 
1048  /**
1049  * The sizes of the dimensions of the buffer. Assuming the following
1050  * buffer buf[N0][N1][N2], dim_sizes should be {N0, N1, N2}.
1051  */
1052  std::vector<tiramisu::expr> dim_sizes;
1053 
1054  /**
1055  * The tiramisu function where this buffer is declared or where the
1056  * buffer is an argument.
1057  */
1058  tiramisu::function *fct;
1059 
1060  /**
1061  * The name of the buffer.
1062  * Buffer names should not start with _ (an underscore).
1063  * Names starting with _ are reserved names.
1064  */
1065  std::string name;
1066 
1067  /**
1068  * The type of the elements of the buffer.
1069  */
1070  tiramisu::primitive_t type;
1071 
1072  /**
1073  * The location of the buffer (host if in memory, else where on GPU).
1074  */
1075  cuda_ast::memory_location location;
1076 
1077 protected:
1078  /**
1079  * Set the type of the argument. Three possible types exist:
1080  * - a_input: for inputs of the function,
1081  * - a_output: for outputs of the function,
1082  * - a_temporary: for buffers used as temporary buffers within
1083  * the function (any temporary buffer is allocated automatically by
1084  * the Tiramisu runtime at the entry of the function and is
1085  * deallocated at the exit of the function).
1086  */
1087  void set_argument_type(tiramisu::argument_t type);
1088 
1089  /**
1090  * Return whether the buffer should be allocated automatically.
1091  */
1092  bool get_auto_allocate();
1093 
1094  /**
1095  * Set the size of a dimension of the buffer.
1096  */
1097  void set_dim_size(int dim, int size);
1098 
1099 public:
1100  /**
1101  * \brief Create a tiramisu buffer.
1102  *
1103  * \details
1104  *
1105  * A Tiramisu buffer is equivalent to an array in C.
1106  *
1107  * Buffers have two use cases:
1108  * - Used to store the results of computations, and
1109  * - Used to represent input arguments to functions.
1110  *
1111  * \p name is the name of the buffer.
1112  *
1113  * \p dim_sizes is a vector of tiramisu expressions that represent the
1114  * size of each dimension in the buffer.
1115  * Assuming we want to declare the buffer buf[N0][N1][N2],
1116  * then the vector of sizes should be {N0, N1, N2}.
1117  * Buffer dimensions in Tiramisu have the same semantics as in
1118  * C/C++.
1119  *
1120  * \p type is the type of the elements of the buffer.
1121  * It must be a primitive type (i.e. p_uint8, p_uint16, ...).
1122  * Possible types are declared in \ref tiramisu::primitive_t
1123  * (in type.h).
1124  *
1125  * \p argt indicates whether this buffer is an input or an output
1126  * buffer and whether it should be allocated automatically by Tiramisu.
1127  * The possible types (\ref tiramisu::argument_t) are:
1128  *
1129  * - tiramisu::a_input: indicates that the buffer is supposed
1130  * to be passed as input to the function generated by Tiramisu.
1131  *
1132  * - tiramisu::a_output: indicates that the buffer is supposed
1133  * to be passed as output to the function generated by Tiramisu.
1134  * Input and output buffers should be allocated by the user
1135  * before calling the Tiramisu generated function. Tiramisu
1136  * does not allocate memory for input and output buffers.
1137  *
1138  * - tiramisu::a_temporary: this buffer is not supposed to
1139  * be passed to the function as an argument. It will be
1140  * allocated automatically by the Tiramisu compiler at
1141  * the beginning of the function and deallocated at the end
1142  * of the function. The user can also allocate the buffer
1143  * "manually" by setting the automatic allocation to false for
1144  * this buffer.
1145  *
1146  * \p fct is a pointer to a Tiramisu function where the buffer is
1147  * declared or used. If this argument is not provided (which is
1148  * the common case), the function that was created automatically
1149  * during Tiramisu initialization will be used (we call that
1150  * function the "implicit function").
1151  *
1152  * Buffer names should not start with _ (an underscore).
1153  * Names starting with _ are reserved names.
1154  */
1155  buffer(std::string name, std::vector<tiramisu::expr> dim_sizes,
 // NOTE(review): the remaining constructor parameters (\p type, \p argt and
 // \p fct, described above) were dropped by the doc-listing extraction;
 // confirm the full signature against the actual core.h.
1158 
1159  /**
1160  * \brief Indicate when to allocate the buffer (i.e., the schedule).
1161  *
1162  * \details The buffer is allocated in the same loop of the computation \p C
1163  * at the loop level \p level (but the order between the two is not
1164  * specified).
1165  *
1166  * For example, let's assume that buf0 is a buffer, and let's assume
1167  * that we have three computations C1, C2 and C3 scheduled as follow
1168  *
1169  * \code
1170  * for (i=0; i<N; i++)
1171  * for (j=0; j<N; j++)
1172  * for (k=0; k<N; k++)
1173  * C1;
1174  *
1175  * for (i=0; i<N; i++) // level 0
1176  * for (j=0; j<N; j++) // level 1
1177  * for (k=0; k<N; k++) // level 2
1178  * {
1179  * C2;
1180  * C3;
1181  * }
1182  * \endcode
1183  *
1184  * The following Tiramisu code
1185  *
1186  * \code
1187  * tiramisu::computation *C4 = buf0.allocate_at(C2, j);
1188  * C4->before(C2, j);
1189  * \endcode
1190  *
1191  * would allocate buf0 in the loop surrounding C2 at the loop
1192  * level 1. The allocation computation is called C4, where
1193  * C4 is scheduled to execute before C2 at the loop level 1.
1194  * The generated code would look like the following code:
1195  *
1196  * \code
1197  * for (i=0; i<N; i++)
1198  * for (j=0; j<N; j++)
1199  * for (k=0; k<N; k++)
1200  * C1;
1201  *
1202  * for (i=0; i<N; i++) // level 0
1203  * for (j=0; j<N; j++) // level 1
1204  * {
1205  * allocate(buf0, buffer_size, buffer_type);
1206  * for (k=0; k<N; k++) // level 2
1207  * {
1208  * C2;
1209  * C3;
1210  * }
1211  * }
1212  * \endcode
1213  *
1214  */
1215  //@{
1217  tiramisu::computation *allocate_at(tiramisu::computation &C, int level);
1218  //@}
1219 
1220  /**
1221  * \brief Dump the function on standard output.
1222  * \details This functions dumps most of the fields of the buffer class
1223  * but not all of them. It is mainly useful for debugging.
1224  * If \p exhaustive is set to true, all the fields of the buffer
1225  * class are printed.
1226  */
1227  void dump(bool exhaustive) const;
1228 
1229  /**
1230  * \brief If this buffer is an argument to a tiramisu::function,
1231  * return the type of the argument
1232  *
1233  * \details Three possible types exist:
1234  * - tiramisu::a_input: for inputs of the function,
1235  * - tiramisu::a_output: for outputs of the function,
1236  * - tiramisu::a_temporary: for buffers used as temporary buffers within
1237  * the function (any temporary buffer is allocated automatically by
1238  * the Tiramisu runtime at the entry of the function and is
1239  * deallocated at the exit of the function).
1240  */
1241  tiramisu::argument_t get_argument_type() const;
1242 
1243  /**
1244  * Return the name of the buffer.
1245  */
1246  const std::string &get_name() const;
1247 
1248  /**
1249  * Get the number of dimensions of the buffer.
1250  */
1251  int get_n_dims() const;
1252 
1253  /**
1254  * Return the type of the elements of the buffer.
1255  */
1256  tiramisu::primitive_t get_elements_type() const;
1257 
1258  /**
1259  * Return the sizes of the dimensions of the buffer.
1260  */
1261  const std::vector<tiramisu::expr> &get_dim_sizes() const;
1262 
1263  /**
1264  * Set whether the buffer should be allocated automatically.
1265  */
1266  void set_auto_allocate(bool auto_allocation);
1267 
1268  /**
1269  * Return true if all extents of the buffer are literal integer
1270  * constants (e.g., 4, 10, 100, ...).
1271  */
1272  bool has_constant_extents();
1273 
1274  /**
1275  * Return true if a statement that allocates the buffer was
1276  * already generated.
1277  */
 // NOTE(review): the top-level const on the bool return value has no effect.
1278  const bool is_allocated() const;
1279 
1280  /**
1281  * Mark an array as already allocated.
1282  */
1283  void mark_as_allocated();
1284 
1285  /** Tag the buffer as located in the GPU global memory. */
1286  void tag_gpu_global();
1287  /** Tag the buffer as located in the GPU register memory.
1288  * The buffer needs to have a unique dimension of size 1 (i.e. needs to be a scalar).*/
1289  void tag_gpu_register();
1290  /** Tag the buffer as located in the GPU shared memory. */
1291  void tag_gpu_shared();
1292  /** Tag the buffer as located in the GPU local thread memory. */
1293  void tag_gpu_local();
1294  /** Tag the buffer as located in the GPU constant memory. */
1295  void tag_gpu_constant();
1296 
1297 };
1298 
1299 /**
1300  * \brief A class that represents computations.
1301  *
1302  * \details A computation is an expression associated with an iteration domain.
1303  * A computation indicates what needs to be computed (the expression
1304  * that should be computed).
1305  * A computation has three representations:
1306  * - Level I: this level specifies "what" should be computed but does
1307  * not specify "when" (order) and "where" (on which processor) each
1308  * expression should be computed.
1309  * This level also does not specify where computations should be stored
1310  * in memory and in which data layout.
1311  * - Level II: this level specifies "what" should be computed, "when", i.e.
1312  * the order in which the computation should be executed with regard to
1313  * the other computations. And "where" each computation should be
1314  * computed (i.e., on which processor).
1315  * This level still does not specify where computations should be stored
1316  * in memory and their data layout.
1317  * - Level III: this level is similar to Level II but it specifies where
1318  * computations should be stored in memory and the data layout.
1319  */
1321 {
1322  friend input;
1323  friend function;
1324  friend generator;
1325  friend buffer;
1326  friend constant;
1327  friend sync;
1328  friend computation_tester;
1329  friend send;
1330  friend recv;
1331  friend tiramisu::wait;
1332  friend cuda_ast::generator;
1333 
1334 private:
1335 
1336  /**
1337  * Access function. A map indicating how each computation should be stored
1338  * in memory. It indicates in which buffer the computation should be stored
1339  * and which element of the buffer exactly it should be stored.
1340  */
1341  isl_map *access;
1342 
1343  /**
1344  * A vector that contains the list of let statements associated
1345  * with this computation.
1346  *
1347  * A let statement that is associated with the computation is a let statement
1348  * that will be added just before the computation. The scope of the variable
1349  * defined by the let statement is this computation alone. i.e., it is not
1350  * defined in other computations.
1351  */
1352  std::vector<std::pair<std::string, tiramisu::expr>> associated_let_stmts;
1353 
1354  /**
1355  * The buffer attached "automatically" to this computation.
1356  * If the buffer is not created automatically, this variable will be empty.
1357  */
1358  tiramisu::buffer *automatically_allocated_buffer;
1359 
1360  /**
1361  * An ISL context associated with the function.
1362  */
1363  isl_ctx *ctx;
1364 
1365  /**
1366  * Data type: the type of the value returned by the computation.
1367  */
1368  tiramisu::primitive_t data_type;
1369 
1370  /**
1371  * Number of definitions added to this computation. Each time the function
1372  * add_definitions is called, definitions_number is incremented.
1373  */
1374  int definitions_number;
1375 
1376  /**
1377  * The ID of this definition. Each new computation when created has a
1378  * definition_ID equal to 0. When a new definition is added, its ID
1379  * is 1, when a new definition is added, its definition ID is set to 2,
1380  * ...
1381  */
1382  int definition_ID;
1383 
1384  /**
1385  * An integer indicating the number of duplicates of this computation.
1386  * We use this to set the duplicate ID of any newly created computation.
1387  * Whenever a new duplicate is created, this number is incremented.
1388  */
1389  int duplicate_number;
1390 
1391  /**
1392  * An expression representing the computation
1393  * ("what" should be computed).
1394  */
1395  tiramisu::expr expression;
1396 
1397  /**
1398  * If has_multiple_definitions() is true, then this variable contains the
1399  * computation that was defined first among all the multiple definitions.
1400  */
1401  tiramisu::computation *first_definition;
1402 
1403  /**
1404  * The function where this computation is declared.
1405  */
1406  tiramisu::function *fct;
1407 
1408  /**
1409  * An isl_ast_expr representing the index of the array where the computation
1410  * will be stored. This index is computed after the scheduling is done.
1411  */
1412  std::vector<isl_ast_expr *> index_expr;
1413 
1414  /**
1415  * A map between the original names of the iterators of a computation
1416  * and their transformed form after schedule (also after renaming).
1417  *
1418  * If in the original computation, we had
1419  *
1420  * {C[i0, i1]: ...}
1421  *
1422  * And if in the generated code, the iterators are called c0, c1, c2 and c3 and
1423  * the loops are tiled, then the map will be
1424  *
1425  * {<i0, c0*10+c2>, <i1, c1*10+c3>}.
1426  */
1427  std::map<std::string, isl_ast_expr *> iterators_map;
1428 
1429  /**
1430  * Does this computation represent a let statement ?
1431  *
1432  * Let statements should be treated differently:
1433  * - During Halide code generation a Halide let statement should be
1434  * created instead of an assignment statement.
1435  * - A let statement does not have/need an access function because
1436  * it writes directly to a scalar.
1437  * - When targeting Halide, let statements should be created after
1438  * their body is created, because the body is an argument needed
1439  * for the creation of the let statement.
1440  */
1441  bool is_let;
1442 
1443  /**
1444  * This variable is set to true if this computation is the first definition.
1445  * It is set to false if has_multiple_definitions() is true but this computation
1446  * is not the first one defined.
1447  * This is useful because in tiramisu, we assume that all the computations that
1448  * have the same name must have the same memory access. Thus any new definition
1449  * of a computation must copy the same memory access as the first definition, thus
1450  * we need to know which computation is the first definition.
1451  */
1452  bool is_first;
1453 
1454  /**
1455  * Return true if this computation is the first definition.
1456  * It returns false if has_multiple_definitions() is true but this computation
1457  * is not the first one defined.
1458  * This is useful because in tiramisu, we assume that all the computations that
1459  * have the same name must have the same memory access. Thus any new definition
1460  * of a computation must copy the same memory access as the first definition, thus
1461  * we need to know which computation is the first definition.
1462  */
1463  bool is_first_definition();
1464 
1465  /** True if the computation is inline. */
1466  bool is_inline;
1467 
1468  /**
1469  * Iteration domain of the computation.
1470  * In this representation, the order of execution of computations
1471  * is not specified, the computations are also not mapped to memory.
1472  */
1473  isl_set *iteration_domain;
1474 
1475  /**
1476  * The name of this computation.
1477  * Computation names should not start with _ (an underscore).
1478  * Names starting with _ are reserved names.
1479  */
1480  std::string name;
1481 
1482  /** The number of dimensions in the original definition of the computation. */
1483  int number_of_dims;
1484 
1485  /**
1486  * A predicate around the computation. The computation is executed
1487  * only if this predicate is true. This is useful to insert a non-affine
1488  * condition around the computation.
1489  */
1490  tiramisu::expr predicate;
1491 
1492  /**
1493  * The schedule of the computation.
1494  */
1495  isl_map * schedule;
1496 
1497  /**
1498  * TODO: use buffers directly from computations, no need to have
1499  * bindings.
1500  *
1501  * \p schedule_this_computation should be set to true when the computation
1502  * should be scheduled and when code for the computation should be generated
1503  * during code generation.
1504  * It should be set to false when the computation is used to represent a
1505  * buffer (i.e., the computation is used only as a binding to a buffer).
1506  * In this case, the computation is not scheduled and no code for the
1507  * computation is generated.
1508  */
1509  bool schedule_this_computation;
1510 
1511  // Private class members that are computed during code generation.
1512 
1513  /**
1514  * Halide statement that assigns the computation to a buffer location.
1515  */
1516  Halide::Internal::Stmt stmt;
1517 
1518  /**
1519  * Time-processor domain of the computation.
1520  * In this representation, the logical time of execution and the
1521  * processor where the computation will be executed are both
1522  * specified.
1523  */
1524  isl_set *time_processor_domain;
1525 
1526  /**
1527  * True if the computation is executed in a nonblocking or asynchronous way.
1528  */
1529  bool _is_nonblock_or_async;
1530 
1531  /**
1532  * True if the rank should be dropped from index linearization.
1533  */
1534  bool _drop_rank_iter;
1535 
1536  /**
1537  * If _drop_rank_iter == true, this is the level to drop
1538  */
1539  var drop_level;
1540 
1541  /**
1542  * If the computation represents a library call, this will contain the
1543  * necessary arguments to the function.
1544  */
1545  std::vector<tiramisu::expr> library_call_args;
1546 
1547  /**
1548  * If the computation represents a library call and requires a linear index (for either a LHS or RHS access),
1549  * this indicates which argument to the function represents the linear index.
1550  */
1551  // @{
1552  int lhs_argument_idx;
1553  int rhs_argument_idx;
1554  // @}
1555 
1556  /**
1557  * If the computation represents a library call and requires a wait, this indicates which argument to the function
1558  * represents the wait request (NOTE(review): original said "linear index", likely copy-pasted from above — confirm).
1559  */
1560  int wait_argument_idx;
1561 
1562  /**
1563  * If the computation represents a library call and requires a wait, this is like a regular access map, but gives
1564  * an access into a special wait buffer.
1565  */
1566  isl_map *wait_access_map = nullptr;
1567 
1568  /**
1569  * By default, this becomes an o_access to signify that we are writing into a location. However, it can be changed
1570  * to something like o_address_of to indicate that we don't want to do a store to the location, rather we just
1571  * want to get a pointer to the store location.
1572  */
1573  // @{
1574  tiramisu::op_t lhs_access_type;
1575  tiramisu::op_t rhs_access_type;
1576  // @}
1577 
1578  /**
1579  * Apply a transformation on the domain of the schedule.
1580  * This is a transformation from iteration domain to the time-processor
1581  * domain.
1582  *
1583  * For example, to shift the i dimension of the iteration domain
1584  * of C0, you can apply the transformation
1585  *
1586  * C0[i, j] -> C0[i+2, j]
1587  */
1588  void apply_transformation_on_schedule_domain(std::string map_str);
1589 
1590  /**
1591  * Add the set of constraints \p domain_constraints to the domain
1592  * of the schedule and add the set of constraints \p range_constraints
1593  * to the range of the schedule.
1594  *
1595  * \p domain_constraints and \p range_constraints are both ISL sets in
1596  * the ISL string format.
1597  */
1598  void add_schedule_constraint(std::string domain_constraints, std::string range_constraints);
1599 
1600  /**
1601  * Check that the names used in \p dimensions are not already
1602  * in use.
1603  */
1604  void assert_names_not_assigned(std::vector<std::string> dimensions);
1605 
1606  /**
1607  * Return true if a buffer was allocated to this computation or to one
1608  * of its updates (we assume that we allocate the same buffer for the
1609  * computation and its updates).
1610  */
1611  bool buffer_already_allocated();
1612 
1613  /**
1614  * Identical to
1615  * void before(computation &consumer, tiramisu::var L);
1616  * Except that the loop level in this case is identified using its number. This is
1617  * used mainly internally by Tiramisu while the other is designed to be user friendly.
1618  */
1619  void before(computation &consumer, int L);
1620 
1621  /**
1622  * Check that the \p dimensions are valid:
1623  * - The dimension numbers are within the bounds of accepted dimensions
1624  * (i.e., between computation::root_dimension and the maximal dimension
1625  * number in the time-space domain).
1626  */
1627  void check_dimensions_validity(std::vector<int> dimensions);
1628 
1629  /**
1630  * Compute two subsets of computations:
1631  * - the first is the subset of needed computations,
1632  * - the second is the subset of produced computations,
1633  *
1634  * The needed computations are those that need to be consumed
1635  * by the \p consumer if this computation is to be executed in the
1636  * same loop as the consumer but at loop level \p L.
1637  *
1638  * The produced subset represents the computations produced by
1639  * this computation at the level \p L.
1640  *
1641  * This function takes in consideration the schedule of
1642  * this computation and the schedule of consumer in order to
1643  * compute the needed area.
1644  *
1645  * This function creates new parameters, these parameters are used to specialize
1646  * the needed area for specific loop iterators (i.e., fix the needed area).
1647  * These parameters are stored in \p param_names.
1648  * This vector is used internally by Tiramisu in compute_at() which calls this
1649  * function.
1650  */
1651  std::vector<isl_set *> compute_needed_and_produced(computation &consumer, int L,
1652  std::vector<std::string> &param_names);
1653 
1654 
1654 
1655  /**
1656  * Take a list of iterators and a name as input and construct the iteration
1657  * domain that iterates over those iterators. The name of the statement
1658  * of the iteration domain is \p name.
1659  * The iteration domain is a string in ISL format.
1660  */
1661  std::string construct_iteration_domain(std::string name, std::vector<var> iterator_variables);
1662 
1663  /**
1664  * Create a copy of this computation.
1665  */
1666  tiramisu::computation *copy();
1667 
1668  /**
1669  * Create a Halide statement that assigns the computation to the memory
1670  * buffer and location specified by the access function.
1671  */
1672  void create_halide_assignment();
1673 
1674  /**
1675  * Returns a pair of expressions (LHS, RHS) representing the assignment performed
1676  * by the computation.
1677  * The LHS would represent a memory buffer access, and the RHS would represent
1678  * the value of the assignment.
1679  */
1680  std::pair<expr, expr> create_tiramisu_assignment(std::vector<isl_ast_expr *> &index_expr);
1681 
1682  /**
1683  * Apply a duplication transformation from iteration space to
1684  * time-processor space.
1685  * A duplication transformation duplicates the original computation,
1686  * so the domain of the schedule has to be the iteration domain of
1687  * the original computation.
1688  *
1689  * For example, to duplicate C0 into a first duplicate:
1690  *
1691  * C0[i, j] -> C0[1, 0, i, 0, j, 0]
1692  *
1693  * To duplicate C0 again
1694  *
1695  * C0[i, j] -> C0[2, 0, j, 0, i, 0]
1696  *
1697  */
1698  void create_duplication_transformation(std::string map_str);
1699 
1700  /**
1701  * Create a new Tiramisu constant M = v*floor(N/v) and use it as
1702  * a separator.
1703  *
1704  * The separator is used to separate a computation. That
1705  * is, it is used to create two identical computations where we have
1706  * a constraint like i<M in the first and i>=M in the second.
1707  * The first is called the full computation while the second is called
1708  * the separated computation.
1709  *
1710  * This function is used in vectorize and unroll mainly.
1711  */
1712  tiramisu::constant *create_separator(const tiramisu::expr &loop_upper_bound, int v);
1713 
1714  /**
1715  * Duplicate a part of this computation (or all of it) and return
1716  * the duplicate computation. The duplicate computation is identical
1717  * to this computation in every aspect except that its iteration domain
1718  * is the intersection of the iteration domain of the original computation
1719  * and the domain provided in \p domain_constraints.
1720  *
1721  * Example: let us assume that you have the following computation
1722  *
1723  * {C0[i,j]: 0<=i<N and 0<=j<N}
1724  *
1725  * If you want to create a duplicate of this computation
1726  * that has the following iteration domain
1727  *
1728  * "{C0[i,j]: 0<=i<=10 and 0<=j<=5}"
1729  *
1730  * you can write
1731  *
1732  * C0.duplicate("{C0[i,j]: 0<=i<=10 and 0<=j<=5}");
1733  *
1734  * This will keep the original computation C0 and will create a
1735  * new computation (duplicate of C0) that has
1736  * "{C0[i,j]: 0<=i<=10 and 0<=j<=5}" as iteration domain.
1737  * (duplicate() in fact intersects the domain provided with
1738  * the original domain).
1739  *
1740  * The duplicate computation is an exact copy of the original
1741  * computation except in one thing:
1742  * The iteration domain of the duplicate computation is
1743  * the intersection of the iteration domain of the original
1744  * computation with the constraints provided as an argument
1745  * to the duplicate command; this means that the iteration domain
1746  * of the duplicate computation is always a subset of the original
1747  * iteration domain;
1748  *
1749  * If you schedule a computation C, then duplicate it, then the duplicate
1750  * will have exactly the same schedule as the original computation.
1751  * After duplication, any schedule applied on the original computation
1752  * will not be applied automatically on the duplicate computation. They
1753  * become two separate computations that need to be scheduled separately.
1754  *
1755  * \p domain_constraints is a set of constraints on the iteration domain that
1756  * define the duplicate.
1757  * \p range_constraints is a set of constraints on the time-processor domain
1758  * that define the duplicate.
1759  */
1760  tiramisu::computation *duplicate(std::string domain_constraints,
1761  std::string range_constraints);
1762 
1763  /**
1764  * Return the access function of the computation.
1765  */
1766  isl_map *get_access_relation() const;
1767 
1768  /**
1769  * Return the access function of the computation after transforming
1770  * it to the time-processor domain.
1771  * The domain of the access function is transformed to the
1772  * time-processor domain using the schedule, and then the transformed
1773  * access function is returned.
1774  */
1775  isl_map *get_access_relation_adapted_to_time_processor_domain() const;
1776 
1777  /**
1778  * Return vector of associated let statements.
1779  *
1780  * This is a vector that contains the list of let statements
1781  * associated with this computation.
1782  *
1783  * A let statement that is associated with the computation is a
1784  * let statement that will be added just before the computation.
1785  * The scope of the variable defined by the let statement is this
1786  * computation alone. i.e., it is not defined in other computations.
1787  */
1788  const std::vector<std::pair<std::string, tiramisu::expr>> &get_associated_let_stmts() const;
1789 
1790  /**
1791  * Get the name of the dynamic dimension that corresponds to the
1792  * \p loop_level in the time-space domain.
1793  */
1794  std::string get_dimension_name_for_loop_level(int loop_level);
1795 
1796  /**
1797  * Return the number of the duplicates of this computation.
1798  *
1799  * The number of duplicates is incremented if the computation is duplicated using
1800  * the duplicate() function.
1801  */
1802  int get_duplicates_number() const;
1803 
1804  /**
1805  * If has_multiple_definitions() is true, then this function returns the
1806  * computation that was defined first among all the multiple definitions.
1807  */
1808  tiramisu::computation *get_first_definition();
1809 
1810  /**
1811  * Return the function where the computation is declared.
1812  */
1813  tiramisu::function *get_function() const;
1814 
1815  /**
1816  * Return the Halide statement that assigns the computation to a buffer location.
1817  * Before calling this function the user should first call Halide code generation
1818  * (function::gen_halide_stmt()).
1819  */
1820  Halide::Internal::Stmt get_generated_halide_stmt() const;
1821 
1822  /**
1823  * Return vector of isl_ast_expr representing the indices of the array where
1824  * the computation will be stored.
1825  */
1826  std::vector<isl_ast_expr *> &get_index_expr();
1827 
1828  /**
1829  * Get the union of the iteration domains of this computation and
1830  * all the other definitions of this computation (updates,
1831  * duplicates, ...).
1832  */
1833  isl_set *get_iteration_domains_of_all_definitions();
1834 
1835 
1836  /**
1837  * The iterators map is a map between the original names of the iterators of a computation
1838  * and their transformed form after schedule (also after renaming).
1839  *
1840  * If in the original computation, we had
1841  *
1842  * {C[i0, i1]: ...}
1843  *
1844  * And if in the generated code, the iterators are called c0, c1, c2 and c3 and
1845  * the loops are tiled, then the map will be
1846  *
1847  * {<i0, c0*10+c2>, <i1, c1*10+c3>}.
1848  */
1849  std::map<std::string, isl_ast_expr *> get_iterators_map();
1850 
1851  /**
1852  * Return the names of iteration domain dimensions.
1853  */
1854  std::vector<std::string> get_iteration_domain_dimension_names();
1855 
1856  /**
1857  * Get the number of dimensions of the iteration
1858  * domain of the computation.
1859  */
1860  int get_iteration_domain_dimensions_number();
1861 
1862  /**
1863  * Return the names of loop levels.
1864  * (i.e., the names of dynamic dimensions in time-space).
1865  */
1866  std::vector<std::string> get_loop_level_names();
1867 
1868  /**
1869  * Get the number of loop levels.
1870  */
1871  int get_loop_levels_number();
1872 
1873  /**
1874  * Gets the span of the loop level \p level (number of iterations) and returns
1875  * it as a tiramisu::expr.
1876  */
1877  expr get_span(int level);
1878 
1879  /**
1880  * Return the root of the tree of definitions of this computation.
1881  * The tree of definitions is the tree that emerges after adding multiple
1882  * definitions to the same computation. Let's take the following example.
1883  * Assuming we have the computation c0. We can add a definition c1 to c0.
1884  * The tree in this case would have two nodes, c0 and c1. c0 is the root.
1885  * We can add another definition c2 to c0. Then we can add another definition c3
1886  * to c1. In this case the tree would have four nodes where c0 is the root.
1887  */
1888  tiramisu::computation *get_root_of_definition_tree();
1889 
1890  /**
1891  * Get the number of dimensions of the time-space
1892  * domain of the computation.
1893  */
1894  int get_time_space_dimensions_number();
1895 
1896  /**
1897  * Trim the union of schedules of the computation and
1898  * return the result.
1899  * The trimmed union of schedules is the schedule without the
1900  * duplicate dimension (the dimension used to indicate the
1901  * duplicate ID).
1902  */
1903  isl_map *get_trimmed_union_of_schedules() const;
1904 
1905  /**
1906  * Return the time-processor domain of the computation.
1907  * In this representation, the logical time of execution and the
1908  * processor where the computation will be executed are both
1909  * specified.
1910  */
1911  isl_set *get_time_processor_domain() const;
1912 
1913  /**
1914  * Return the trimmed time-processor domain.
1915  * The first dimension of the time-processor domain is used
1916  * to indicate redundancy of the computation. Computations that
1917  * are not redundant have 0 in that dimension, whereas
1918  * computations that are redundant (i.e., are recomputed) have
1919  * a number different from 0 in that dimension and which represents
1920  * the ID of the redundant computation.
1921  * The trimmed time-processor domain is the time-processor domain
1922  * without the dimension that represents the redundancy. We simply
1923  * take the time-processor domain and remove the first dimension.
1924  */
1925  isl_set *get_trimmed_time_processor_domain();
1926 
1927  /**
1928  * Generate an identity schedule for the computation.
1929  *
1930  * This identity schedule is an identity relation created from the iteration
1931  * domain.
1932  */
1933  isl_map *gen_identity_schedule_for_iteration_domain();
1934 
1935  /**
1936  * Generate an identity schedule for the computation.
1937  *
1938  * This identity schedule is an identity relation created from the
1939  * time-processor domain.
1940  */
1941  isl_map *gen_identity_schedule_for_time_space_domain();
1942 
1943  /**
1944  * Returns all updates that have been defined for this computation using
1945  * add_definitions. The 0th update is a pointer to this computation.
1946  */
1947  std::vector<computation*>& get_updates();
1948 
1949  /**
1950  * Search the time-space domain (the range of the schedule) and
1951  * return the loop level numbers that correspond to the dimensions
1952  * named \p dim.
1953  * In other words, translate the vector of dimension names (\p dim_names)
1954  * into loop level numbers. We need to do this because the internal Tiramisu
1955  * scheduling functions use dimension numbers instead of dimension
1956  * names (which are used in the user level scheduling functions).
1957  */
1958  std::vector<int> get_loop_level_numbers_from_dimension_names(std::vector<std::string> dim_names);
1959 
1960  /**
1961  * Return true if this computation is supposed to have an access to other
1962  * computations.
1963  * Knowing this is important so that tiramisu does not try to compute
1964  * the accesses of the RHS.
1965  */
1966  bool has_accesses() const;
1967 
1968  /**
1969  * Return if this computation represents a let statement.
1970  *
1971  * Let statements should be treated differently because:
1972  * - A let statement does not have/need an access function because
1973  * it writes directly to a scalar.
1974  * - If the backend is Halide:
1975  * - During Halide code generation a Halide let statement
1976  * should be created instead of an assignment statement.
1977  * - When targeting Halide, let statements should be created
1978  * after their body is created, because the body is an argument
1979  * needed for the creation of the let statement.
1980  */
1981  bool is_let_stmt() const;
1982 
1983  /**
1984  * Return true if this computation represents a call to a library function.
1985  * Mainly used for distributed communication functions.
1986  */
1987  bool is_library_call() const;
1988 
1989  /**
1990  * Return true if the rank loop iterator should be removed from linearization.
1991  */
1992  bool should_drop_rank_iter() const;
1993 
1994  int get_level_to_drop();
1995 
1996  /**
1997  * Assign a name to iteration domain dimensions that do not have a name.
1998  */
1999  void name_unnamed_iteration_domain_dimensions();
2000 
2001  /**
2002  * Assign a name to iteration domain dimensions that do not have a name.
2003  */
2004  void name_unnamed_time_space_dimensions();
2005 
2006  /**
2007  * Set an identity schedule for the computation.
2008  *
2009  * This identity schedule is an identity relation created from the iteration
2010  * domain.
2011  *
2012  * This sets the schedule of the original computation
2013  * and does not set the schedule of any duplicate
2014  * computation.
2015  */
2016  void set_identity_schedule_based_on_iteration_domain();
2017 
2018  /**
2019  * Return true if this computation is supposed to be scheduled
2020  * by Tiramisu.
2021  */
2022  bool should_schedule_this_computation() const;
2023 
2024  /**
2025  * Intersect \p set with the context of the computation.
2026  */
2027  // @{
2028  isl_set *intersect_set_with_context(isl_set *set);
2029  isl_map *intersect_map_domain_with_context(isl_map *map);
2030  // @}
2031 
2032  /**
2033  * Rename this computation and modify the schedule and the access relation
2034  * accordingly.
2035  *
2036  * Computations that are defined multiple times need to be renamed, because
2037  * those computations in general have different expressions and the code
2038  * generator expects that computations that have the same name always have
2039  * the same expression and access relation. So we should rename them to avoid
2040  * any ambiguity for the code generator.
2041  */
2042  void rename_computation(std::string new_name);
2043 
2044  /**
2045  * Separate the iteration domain into two iteration domains using
2046  * the constants \p N and \p v.
2047  * Let us assume that the dimension \p dim of the iteration domain
2048  * is called i. The iteration domain is separated into two domains
2049  * using the hyperplane (i = v*floor(N/v)). That means, two copies of the
2050  * iteration domain are created, the constraint (i<=v*floor(N/v)) is added to
2051  * the schedule of the first while the constraint (i>v*floor(N/v)) is added to
2052  * the schedule of the second.
2053  *
2054  * Let us assume that we have the following iteration domain
2055  *
2056  * {S0[i,j]: 0<=i<N and 0<=j<M}
2057  *
2058  * To separate this iteration domain by the hyperplane j=4*floor(M/4), one should
2059  * call
2060  *
2061  * S0.separate(1, tiramisu::expr("M"), 4)
2062  *
2063  * This will result in the creation of two computations that have the same
2064  * iteration domains but have different schedules. The schedules are as
2065  * follows:
2066  *
2067  * The schedule of the original (full) computation would be
2068  * {S0[i,j]->S0[0, 0, i, 0, j, 0]: j<4*floor(M/4)}
2069  *
2070  * The schedule of the separated (partial) computation would be
2071  * {S0[i,j]->S0[0, 0, i, 10, j, 0]: 4*floor(M/4)<=j}
2072  *
2073  * The second computation created using separate can be accessed with
2074  * get_update().
2075  */
2076  void separate(int dim, tiramisu::expr N, int v);
2077 
2078  /**
2079  * Like separate, but the precise separate points are specified in _separate_points. _max is the loop max.
2080  */
2081  void separate_at(var level, std::vector<tiramisu::expr> _separate_points, tiramisu::expr _max);
2082 
2083  /**
2084  * Separate then split the loop \p L0
2085  * (see tiramisu::computation::separate and tiramisu::computation::split)
2086  *
2087  * This function returns true if the split actually happened (in some
2088  * cases, if the loop cannot be split because it is too small for example,
2089  * then no split happens even after calling the split function, in such
2090  * cases the function returns false).
2091  */
2092  //@{
2093  bool separateAndSplit(tiramisu::var L0, int sizeX);
2094  bool separateAndSplit(tiramisu::var L0, int sizeX,
2095  tiramisu::var L0_outer, tiramisu::var L0_inner);
2096  bool separateAndSplit(int L0, int sizeX);
2097  //@}
2098 
2099  /**
2100  * Split the loop level \p L0 of the iteration space into two
2101  * new loop levels.
2102  *
2103  * \p sizeX is the extent (size) of the inner loop created after
2104  * splitting.
2105  */
2106  //@{
2107  //@}
2108 
2109  /**
2110  * Set the names of loop levels dimensions.
2111  * The loop levels are specified using \p loop_levels
2112  * and their names are specified using \p names.
2113  * Users can only set the names of loop levels (dynamic dimensions),
2114  * the static dimension names are set to default names.
2115  */
2116  void set_loop_level_names(std::vector<int> loop_levels, std::vector<std::string> names);
2117 
2118  /**
2119  * Set the names of loop level dimensions.
2120  * The loop level names are specified using \p names.
2121  * Users can only set the names of loop levels (dynamic dimensions),
2122  * the static dimension names are set to default names.
2123  */
2124  void set_loop_level_names(std::vector<std::string> names);
2125 
2126  /**
2127  * Set the names of the dimensions of the schedule domain.
2128  * The dimension names are specified using \p names, their positions
2129  * are indicated using \p loop_levels.
2130  */
2131  void set_schedule_domain_dim_names(std::vector<int> loop_levels, std::vector<std::string> names);
2132 
2133  /**
2134  * Set the iteration domain of the computation
2135  */
2136  void set_iteration_domain(isl_set *domain);
2137 
2138  /**
2139  * Set whether a computation has multiple definitions.
2140  * i.e., whether the computation is defined multiple times (i.e.,
2141  * whether there are many computations with the same name). An
2142  * update is a special case of a computation defined multiple
2143  * times.
2144  */
2145  void set_has_multiple_definitions(bool val);
2146 
2147  /**
2148  * The iterators map is a map between the original names of the iterators of a computation
2149  * and their transformed form after schedule (also after renaming).
2150  *
2151  * If in the original computation, we had
2152  *
2153  * {C[i0, i1]: ...}
2154  *
2155  * And if in the generated code, the iterators are called c0, c1, c2 and c3 and
2156  * the loops are tiled, then the map will be
2157  *
2158  * {<i0, c0*10+c2>, <i1, c1*10+c3>}.
2159  */
2160  void set_iterators_map(std::map<std::string, isl_ast_expr *> map);
2161 
2162  /**
2163  * Identical to
2164  * void shift(tiramisu::var L0, int n);
2165  */
2166  void shift(int L0, int n);
2167 
2168  /**
2169  * Simplify \p set using the context and by calling
2170  * set coalescing.
2171  */
2172  // @{
2173  isl_set *simplify(isl_set *set);
2174  isl_map *simplify(isl_map *map);
2175  // @}
2176 
2177  /**
2178  * Identical to
2179  * \code
2180  * void tag_gpu_level(tiramisu::var L0, tiramisu::var L1);
2181  * void tag_gpu_level(tiramisu::var L0, tiramisu::var L1,
2182  * tiramisu::var L2, tiramisu::var L3);
2183  * void tag_gpu_level(tiramisu::var L0, tiramisu::var L1,
2184  * tiramisu::var L2, tiramisu::var L3,
2185  * tiramisu::var L4, tiramisu::var L5);
2186  * \endcode
2187  * The outermost loop level is 0.
2188  */
2189  // @{
2190  void tag_gpu_level(int L0, int L1);
2191  void tag_gpu_level(int L0, int L1, int L2, int L3);
2192  void tag_gpu_level(int L0, int L1, int L2, int L3, int L4, int L5);
2193  // @}
2194 
2195  /**
2196  * Tag the loop levels \p L0 and \p L1 to be mapped to GPU block
2197  * dimensions.
2198  *
2199  * The outermost loop level is 0.
2200  */
2201  // @{
2202  void tag_gpu_block_level(int L0);
2203  void tag_gpu_block_level(int L0, int L1);
2204  void tag_gpu_block_level(int L0, int L1, int L2);
2205  // @}
2206 
2207  /**
2208  * Tag the loop levels \p L0 and \p L1 to be mapped to GPU thread
2209  * dimensions.
2210  *
2211  * The outermost loop level is 0.
2212  */
2213  // @{
2214  void tag_gpu_thread_level(int L0);
2215  void tag_gpu_thread_level(int L0, int L1);
2216  void tag_gpu_thread_level(int L0, int L1, int L2);
2217  // @}
2218 
2219  /**
2220  * Contains a list of all definitions added to this computation. The 0th definition is
2221  * always this very computation.
2222  */
2223  std::vector<tiramisu::computation *> updates;
2224 
2225  /**
2226  * Update loop level names. This function should be called after each scheduling operation
2227  * because scheduling usually changes the original loop level names.
2228  * This function erases \p nb_loop_levels_to_erase loop level names starting from the
2229  * loop level \p start_erasing. It then inserts the loop level names \p new_names in
2230  * \p start_erasing. In other words, it replaces the names of loop levels from
2231  * \p start_erasing to \p start_erasing + \p nb_loop_levels_to_erase with the loop levels
2232  * indicated by \p new_names. This function sets the non-erased loop levels to be equal to the
2233  * original loop level names.
2234  *
2235  * \p original_loop_level_names : a vector containing the original loop level names (loop level
2236  * names before scheduling).
2237  *
2238  * \p new_names : the new loop level names.
2239  *
2240  * \p start_erasing : start erasing loop levels from this loop level.
2241  *
2242  * \p nb_loop_levels_to_erase : number of loop levels to erase.
2243  *
2244  * Example. Assuming the original loop levels are {i0, i1, i2, i3}
2245  *
2246  * Calling this->update_names({i0, i1, i2, i3}, {x0, x1}, 1, 2) updates the loop levels to become
2247  * {i0, x0, x1, i3}.
2248  */
2249  void update_names(std::vector<std::string> original_loop_level_names, std::vector<std::string> new_names,
2250  int start_erasing, int nb_loop_levels_to_erase);
2251 
2252  /**
2253  * A vector describing the access variables in the original definition of a computation.
2254  * For every named dimension, a pair representing the index of the named dimension
2255  * and the name of the dimension is added to access_variables.
2256  * E.g. if a computation is defined as S[i, 0, j], access_variables will contain
2257  * {(0, "i"), (2, "j")}.
2258  */
2259  std::vector<std::pair<int, std::string>> access_variables;
2260 
2261  /**
2262  * Compare two computations.
2263  *
2264  * Two computations are considered to be equal if they have the
2265  * same name.
2266  */
2267  bool operator==(tiramisu::computation comp1);
2268 
2269 protected:
2270 
2271  /**
2272  * True if this computation represents a library call.
2273  */
2275 
2276  /**
2277  * If the computation represents a library call, this is the name of the function.
2278  */
2279  std::string library_call_name;
2280 
2281  /**
2282  * An index expression just for the request buffer.
2283  */
2284  isl_ast_expr *wait_index_expr;
2285 
2286  /**
2287  * Dummy constructor for derived classes.
2288  */
2289  computation();
2290 
2291  /**
2292  * Compute the size of the buffer allocated automatically to hold the
2293  * results of this computation.
2294  */
2295  std::vector<tiramisu::expr>* compute_buffer_size();
2296 
2297  /**
2298  * Return the context of the computations.
2299  */
2300  isl_ctx *get_ctx() const;
2301 
2302  /**
2303  * Return the predicate around this computation if a predicate
2304  * was added using add_predicate().
2305  */
2306  tiramisu::expr get_predicate();
2307 
2308  /**
2309  * Return a unique name of computation; made of the following pattern:
2310  * [computation name]@[computation address in memory]
2311  */
2312  const std::string get_unique_name() const;
2313 
2314  /**
2315  *
2316  * Return true if the computation has multiple definitions.
2317  * i.e., if the computation is defined multiple times.
2318  * An update is a special case where a computation is defined
2319  * multiple times. Duplicate computations are another example.
2320  *
2321  * In the following example, C is defined multiple times whereas
2322  * D is defined only once.
2323  *
2324  * \code
2325  *
2326  * C(0) = 0
2327  *
2328  * C(i) = C(i-1) + 1
2329  *
2330  * D(i) = C(i) + 1
2331  *
2332  * \endcode
2333  */
2334  bool has_multiple_definitions();
2335 
2336  /**
2337  * Initialize a computation.
2338  * This is a private function that should not be called explicitly
2339  * by users.
2340  */
2341  void init_computation(std::string iteration_space_str,
2342  tiramisu::function *fct,
2343  const tiramisu::expr &e,
2344  bool schedule_this_computation,
2346 
2347  /**
2348  * Set the name of the computation.
2349  */
2350  void set_name(const std::string &n);
2351 
2352  /**
2353  * Set the schedule indicated by \p map.
2354  *
2355  * \p map is a string that represents a mapping from the iteration domain
2356  * to the time-space domain (the ISL format to represent maps is
2357  * documented in http://barvinok.gforge.inria.fr/barvinok.pdf in Sec 1.2.2).
2358  *
2359  * The schedule is a map from the iteration domain to a time space
2360  * domain. The same name of space should be used for both the range
2361  * and the domain of the schedule.
2362  *
2363  * In general, users can set the schedule using high level functions such
2364  * as before(), after(), tile(), compute_at(), vectorize(), split(), ...
2365  * The use of this function is only reserved for advanced users who want
2366  * a low level control of the schedule.
2367  *
2368  * Vectors in the time-space domain have the following form
2369  *
2370  * computation_name[redundancy_ID,static,dynamic,static,dynamic,static,dynamic,static,...]
2371  *
2372  * The first dimension of the vector is used to indicate the redundancy ID
2373  * (the notion of the redundancy ID is explained later).
2374  *
2375  * The following dimensions are interleaved dimensions: static, dynamic, static,
2376  * dynamic, ...
2377  * Dynamic dimensions represent the loop levels, while static dimensions are
2378  * used to order statements within a given block of statements in a given loop
2379  * level.
2380  * For example, the computations c0 and c1 in the following loop nest
2381  *
2382  * \code
2383  * for (i=0; i<N: i++)
2384  * for (j=0; j<N; j++)
2385  * {
2386  * c0;
2387  * c1;
2388  * }
2389  *
2390  * \endcode
2391  *
2392  * have the following representations in the iteration domain
2393  *
2394  * \code
2395  * {c0[i,j]: 0<=i<N and 0<=j<N}
2396  * {c1[i,j]: 0<=i<N and 0<=j<N}
2397  * \endcode
2398  *
2399  * and the following representation in the time-space domain
2400  *
2401  * \code
2402  * {c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
2403  * {c1[0,0,i,0,j,1]: 0<=i<N and 0<=j<N}
2404  * \endcode
2405  *
2406  * The first dimension (dimension 0) in the time-space
2407  * domain (the leftmost dimension) is the redundancy ID
2408  * (in this case it is 0, the meaning of this ID will be explained later).
2409  * The second dimension (starting from the left) is a static dimension,
2410  * the third dimension is a dynamic dimension that
2411  * represents the loop level i, ..., the fifth dimension is a dynamic
2412  * dimension that represents the loop level j and the last dimension
2413  * (dimension 5) is a static dimension and allows the ordering of
2414  * c1 after c0 in the loop nest.
2415  *
2416  * To transform the previous iteration domain to the
2417  * time-space domain, the following schedule should be used:
2418  *
2419  * \code
2420  * {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
2421  * {c1[i,j]->c1[0,0,i,0,j,1]: 0<=i<N and 0<=j<N}
2422  * \endcode
2423  *
2424  * The first dimension called "redundancy ID" is only meaningful if the
2425  * computation is redundant. i.e., some parts of the computation are
2426  * redundantly computed. Redundant computations are in general used to
2427  * maximize parallelism and data locality on the expense of doing some
2428  * computations redundantly.
2429  * For example, if the two computations c1(i,j) and c2(i,j) both depend
2430  * on the computation c0(i,j), instead of waiting for c0(i,j) to be
2431  * computed and then computing c1(i,j) and c2(i,j) in parallel, the thread
2432  * executing c1(i,j) can compute c0(i,j) by itself and then run c1(i,j).
2433  * The thread that computes c2(i,j) can do the same and compute c0(i,j)
2434  * by itself and then compute c2(i,j). In this case the two threads do not
2435  * need to wait. This is done at the expense of redundant computation since
2436  * c0(i,j) is computed by both threads.
2437  *
2438  * In general redundant computations are useful when tiling stencil
2439  * computations. In the context of stencils such a tiling is called
2440  * "overlapped tiling". Tiles that depend on results computed by other
2441  * tiles that run in parallel can compute the boundaries redundantly which
2442  * allows them to avoid waiting and thus can run in parallel.
2443  *
2444  * In Tiramisu, the user can indicate that a chunk of a computation
2445  * should be computed redundantly. The original computation always has a redundancy
2446  * ID equal to 0 (which means this is the original computation).
2447  * The redundant chunk has an ID that is different from 0 and that is
2448  * used to uniquely identify it.
2449  *
2450  * For example if we want to compute all of c0 three times (that is,
2451  * compute the original computation and compute two redundant computations),
2452  * we can use the following schedules:
2453  *
2454  * The schedule of the original computation: {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
2455  * The schedule of the redundant computation N°1: {c0[i,j]->c0[1,0,i,0,j,0]: 0<=i<N and 0<=j<N}
2456  * The schedule of the redundant computation N°2: {c0[i,j]->c0[2,0,i,0,j,0]: 0<=i<N and 0<=j<N}
2457  *
2458  * The schedule of c0 in this case would be three maps that map c0[i,j] to
2459  * the three different redundant computations in the time-processor domain:
2460  *
2461  * \code
2462  * {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N;
2463  * c0[i,j]->c0[1,0,i,0,j,0]: 0<=i<N and 0<=j<N;
2464  * c0[i,j]->c0[2,0,i,0,j,0]: 0<=i<N and 0<=j<N}
2465  * \endcode
2466  *
2467  * The function set_schedule() overrides any other schedule set by the high level
2468  * scheduling functions. Currently the user has to choose between using the high
2469  * level scheduling functions or using this low level set_schedule function. The user
2470  * cannot mix the use of the two in the same program because they are not compatible.
2471  */
2472  // @{
2473  void set_schedule(isl_map *map);
2474  void set_schedule(std::string map_str);
2475  // @}
2476 
2477  /**
2478  * Collapse all the iterations of a loop into one single iteration.
2479  */
2480  void full_loop_level_collapse(int level, tiramisu::expr collapse_from_iter);
2481  /**
2482  * \overload
2483  */
2484  computation(std::string name,std::vector<var> iterator_variables, tiramisu::expr e, bool schedule_this_computation, primitive_t t);
2485 
2486 
2487 
2488 public:
2489 
2490  /**
2491  * \brief Constructor for computations.
2492  *
2493  * \details \p iteration_domain is a string that represents the iteration
2494  * domain of the computation. The iteration domain should be written
2495  * in the ISL format (http://barvinok.gforge.inria.fr/barvinok.pdf Section 1.2.1).
2496  *
2497  * The iteration domain of a statement is a set that contains
2498  * all of the execution instances of the statement (a statement in a
2499  * loop has an execution instance for each loop iteration in which
2500  * it executes). Each execution instance of a statement in a loop
2501  * nest is uniquely represented by an identifier and a tuple of
2502  * integers (typically, the values of the outer loop iterators).
2503  *
2504  * For example, the iteration space of the statement S0 in the following
2505  * loop nest
2506  *
2507  * \code
2508  * for (i=0; i<2; i++)
2509  * for (j=0; j<3; j++)
2510  * S0;
2511  * \endcode
2512  *
2513  * is {S0[0,0], S0[0,1], S0[0,2], S0[1,0], S0[1,1], S0[1,2]}
2514  *
2515  * S0[0,0] is the execution instance of S0 in the iteration [0,0].
2516  *
2517  * The previous set of integer tuples can be compactly described
2518  * by affine constraints as follows
2519  *
2520  * {S0[i,j]: 0<=i<2 and 0<=j<3}
2521  *
2522  * In general, the loop nest
2523  *
2524  * \code
2525  * for (i=0; i<N; i++)
2526  * for (j=0; j<M; j++)
2527  * S0;
2528  * \endcode
2529  *
2530  * has the following iteration domain
2531  *
2532  * {S0[i,j]: 0<=i<N and 0<=j<M}
2533  *
2534  * This should be read as: the set of points [i,j] such that
2535  * 0<=i<N and 0<=j<M.
2536  *
2537  * The name of the computation in the iteration domain should not
2538  * start with _ (an underscore). Names starting with _ are reserved
2539  * names.
2540  *
2541  * \p e is the expression computed by the computation.
2542  * It is possible to declare the computation without specifying the expression.
2543  * The expression can be specified later using computation::set_expression().
2544  * An example of setting the expression after declaring the computation
2545  * is presented in tests/test_04.cpp.
2546  *
2547  * \p schedule_this_computation should be set to true if the computation
2548  * is supposed to be scheduled and code is supposed to be generated from
2549  * the computation. Set it to false if you just want to use the
2550  * computation to represent a buffer (that is passed as an argument
2551  * to the function) and you do not intend to generate code for the
2552  * computation.
2553  * An example where this argument is set to false is presented in
2554  * tests/test_14.cpp.
2555  *
2556  * \p t is the type of the computation, i.e. the type of the expression
2557  * computed by the computation. Example of types include (p_uint8,
2558  * p_uint16, p_uint32, ...).
2559  *
2560  * \p fct is a pointer to the Tiramisu function where this computation
2561  * should be added.
2562  *
2563  * Bound Inference:
2564  * The user can declare computations without providing any constraint
2565  * about the iteration domain, in this case he can rely on bound inference
2566  * to infer the constraints about each iteration domain. The user needs only
2567  * to provide constraints over the domains of the last computations (last
2568  * consumers), and Tiramisu will propagate these constraints to all the
2569  * chain of computations that precede those consumers.
2570  * Note that bound inference is not possible if you have multiple definitions
2571  * of the same computation. In such a case, you should provide constraints
2572  * over the iteration domain when you declare the computation.
2573  *
2574  * Examples about bound inference are provided in test_22 to test_25.
2575  */
2576  computation(std::string iteration_domain, tiramisu::expr e,
2577  bool schedule_this_computation, tiramisu::primitive_t t,
2578  tiramisu::function *fct);
2579 
2580  /**
2581  * \brief Constructor for computations.
2582  *
2583  * \details
2584  *
2585  * Same as \ref tiramisu::computation::computation(std::string name, std::vector<var> iterator_variables, tiramisu::expr e)
2586  * except that it has the following additional argument:
2587  *
2588  * \p schedule_this_computation indicates whether this computation should
2589  * be scheduled.
2590  */
2591  computation(std::string name, std::vector<var> iterator_variables, tiramisu::expr e, bool schedule_this_computation);
2592 
2593  /**
2594  * \overload
2595  */
2596  computation(std::vector<var> iterator_variables, tiramisu::expr e);
2597 
2598 
2599  /**
2600  * \brief Constructor for computations.
2601  *
2602  * \details
2603  *
2604  * Declare a computation. A computation in Tiramisu is the equivalent of a
2605  * a statement surrounded by a loop nest in C (an example is provided later).
2606  *
2607  * \p name is the name of the computation.
2608  *
2609  * \p iterator_variables is a vector that represents the loop iterators
2610  * around the computation.
2611  *
2612  * \p e is the expression computed by the computation.
2613  *
2614  * For example, if we have two iterator variables
2615  *
2616  * \code
2617  * var i("i", 0, 20), j("j", 0, 30);
2618  * \endcode
2619  *
2620  * and we have the following computation declaration
2621  *
2622  * \code
2623  * computation S("S", {i,j}, 4);
2624  * \endcode
2625  *
2626  * This is equivalent to writing the following C code
2627  *
2628  * \code
2629  * for (i=0; i<20; i++)
2630  * for (j=0; j<30; j++)
2631  * S(i,j) = 4;
2632  * \endcode
2633  *
2634  * More precisely, the vector {i, j} specifies the iteration domain of
2635  * the computation S0. In this case, 0<=i<20 and 0<=j<30.
2636  *
2637  * It is possible to declare the computation without specifying the expression.
2638  * The expression can be specified later using computation::set_expression().
2639  * An example of setting the expression after declaring the computation
2640  * is presented in tutorial 04. Usually this is needed for writing
2641  * reductions.
2642  *
2643  */
2644  computation(std::string name, std::vector<var> iterator_variables, tiramisu::expr e);
2645 
2646  /**
2647  * \overload
2648  */
2649  computation(std::vector<var> iterator_variables, tiramisu::expr e, bool schedule_this_computation);
2650 
2651 
2652  /**
2653  * \overload
2654  *
2655  * \p t is the type of the computation, i.e. the type of the expression
2656  * computed by the computation. Example of types include (p_uint8,
2657  * p_uint16, p_uint32, ...).
2658  *
2659  * Usually, this constructor is used to declare buffer wrappers.
2660  *
2661  * For example, if we have two iterator variables
2662  *
2663  * \code
2664  * var i("i", 0, 20), j("j", 0, 30);
2665  * \endcode
2666  *
2667  * and we have the following computation declaration
2668  *
2669  * \code
2670  * computation S("S", {i,j}, p_uint8);
2671  * \endcode
2672  *
2673  * This can be used as a wrapper on a buffer buf[20, 30] where the buffer elements
2674  * are of type uint8.
2675  */
2676  computation(std::string name, std::vector<var> iterator_variables, primitive_t t):
2677  computation(name, iterator_variables, expr())
2678  {
      // The delegated constructor was handed an empty expr(), whose dtype is
      // undefined; override both the computation's data type and the stored
      // expression's dtype with the caller-supplied type \p t so that the
      // wrapper computation reports the intended element type.
2679  this->data_type = t;
2680  this->expression.dtype = t;
2681  }
2682 
2683  /**
2684  * \overload
2685  */
2686  computation(std::vector<var> iterator_variables, primitive_t t):
2687  computation(iterator_variables, expr())
2688  {
      // Same as the named overload above: the empty expr() passed to the
      // delegated constructor has an undefined dtype, so both the
      // computation's data type and the expression's dtype are forced to \p t.
2689  this->data_type = t;
2690  this->expression.dtype = t;
2691  }
2692 
2693  virtual bool is_send() const;
2694 
2695  virtual bool is_recv() const;
2696 
2697  virtual bool is_send_recv() const;
2698 
2699  virtual bool is_wait() const;
2700 
2701  /**
2702  * \brief Add a let statement that is associated to this computation.
2703  * \details The let statement will be executed before the computation
2704  * (more precisely, between this computation and any computation that
2705  * precedes it). The variable defined by the let statement can be
2706  * accessed by this computation alone, i.e., it cannot be used
2707  * in any other computation.
2708  */
2709  void add_associated_let_stmt(std::string access_name, tiramisu::expr e);
2710 
2711  /**
2712  * Don't schedule a previously scheduled computation
2713  */
2714  void unschedule_this_computation();
2715 
2716  /**
2717  * \brief Add definitions of computations that have the same name as this
2718  * computation.
2719  *
2720  * \details The arguments of this function are identical to the arguments of
2721  * the computation constructor. In general, this function is used to express
2722  * reductions and to express computation updates.
2723  *
2724  * In other words, this function should be used if the user has already declared
2725  * a set of computations C and wants to declare more computations that have the
2726  * same name.
2727  *
2728  * Example: Let's assume we want to declare the following two computations.
2729  *
2730  * \code
2731  * // First computation
2732  * {C[i]: 0<=i<10}: 0
2733  *
2734  * // Second computation
2735  * {C[i]: 10<=i<20}: 1
2736  * \endcode
2737  *
2738  * To do this, we can declare the first computation using the computation
2739  * constructor and declare the second computation using add_definitions().
2740  *
2741  * The use of add_definitions is purely due to restrictions imposed by the C++ language
2742  * and not by the Tiramisu framework itself. This is mainly because in C++, it is not
2743  * possible to declare two objects with the same name, for example one cannot do
2744  *
2745  * computation C(...);
2746  * computation C(...);
2747  *
2748  * In order to declare the second set of computations, we chose to use the add_definitions
2749  * function to avoid this problem.
2750  *
2751  * The newly added computations must have the same name and the same access function
2752  * as the initial set of computations but can have a different expression.
2753  *
2754  * An example of using this function is available in test_26.
2755  *
2756  */
2757  virtual void add_definitions(std::string iteration_domain_str, tiramisu::expr e,
2758  bool schedule_this_computation, tiramisu::primitive_t t,
2759  tiramisu::function *fct);
2760 
2761  /**
2762  * \brief Add a predicate (condition) on the computation. The computation will be executed
2763  * only if this condition is true.
2764  *
2765  * \details The predicate can be an affine or a non-affine expression.
2766  * If you need to put a condition around a block of statements (i.e., a sequence of computations),
2767  * then you can perform that by adding a predicate to each one of those computations.
2768  * The compiler will then transform automatically the multiple conditions (condition around each
2769  * computation) into one condition around the whole block.
2770  */
2771  void add_predicate(tiramisu::expr predicate);
2772 
2773  /**
2774  * \brief Schedule this computation to run after the computation \p comp.
2775  *
2776  * \details This computation is placed after \p comp in the loop level \p level.
2777  * \p level is a loop level in this computation.
2778  *
2779  * The root level is computation::root. The global variable
2780  * computation::root is equivalent to var("root").
2781  *
2782  * For example assuming we have the two computations
2783  *
2784  * {S0[i,j]: 0<=i<N and 0<=j<N} and {S1[i,j]: 0<=i<N and 0<=j<N}
2785  *
2786  * In order to make S1 run after S0 in the i loop, one should use
2787  *
2788  * S1.after(S0, i)
2789  *
2790  * which means: S1 is after S0 at the loop level i (which is loop level 0).
2791  *
2792  * The corresponding code is
2793  *
2794  * \code
2795  * for (i=0; i<N; i++)
2796  * {
2797  * for (j=0; j<N; j++)
2798  * S0;
2799  * for (j=0; j<N; j++)
2800  * S1;
2801  * }
2802  * \endcode
2803  *
2804  * S1.after(S0, j)
2805  *
2806  * means: S1 is after S0 at the loop level j (which is 1) and would yield
2807  * the following code
2808  *
2809  * \code
2810  * for (i=0; i<N; i++)
2811  * for (j=0; j<N; j++)
2812  * {
2813  * S0;
2814  * S1;
2815  * }
2816  * \endcode
2817  *
2818  * S1.after(S0, computation::root)
2819  * means S1 is after S0 at the main program level and would yield
2820  * the following code
2821  *
2822  * \code
2823  * for (i=0; i<N; i++)
2824  * for (j=0; j<N; j++)
2825  * S0;
2826  * for (i=0; i<N; i++)
2827  * for (j=0; j<N; j++)
2828  * S1;
2829  * \endcode
2830  *
2831  * Note that as with all other scheduling methods:
2832  * - Calling this method with the same computations overwrites the level if it is
2833  * higher.
2834  * - A computation being scheduled after another computation at level L means it is
2835  * scheduled after that computation at all levels lower than L.
2836  * - There should be exactly one computation with no computation scheduled before it.
2837  * - Each other computation should have exactly one computation scheduled before it.
2838  *
2839  */
2840  void after(computation &comp, tiramisu::var iterator);
2841 
2842  /**
2843  * This function is equivalent to
2844  * void after(computation &comp, tiramisu::var iterator);
2845  * except that it uses loop level numbers (0, 1, 2, ...) instead of using loop variables
2846  * (tiramisu::var). Tiramisu internally represent loop levels using numbers instead
2847  * of variable names, and this is the actual function used internally.
2848  *
2849  * The outermost loop level is 0. The root level is computation::root_dimension.
2850  *
2851  * For example assuming we have the two computations
2852  *
2853  * {S0[i,j]: 0<=i<N and 0<=j<N} and {S1[i,j]: 0<=i<N and 0<=j<N}
2854  *
2855  * In order to make S1 run after S0 in the i loop, one should use
2856  *
2857  * S1.after(S0,0)
2858  *
2859  * which means: S1 is after S0 at the loop level 0 (which is i).
2860  *
2861  * The corresponding code is
2862  *
2863  * \code
2864  * for (i=0; i<N; i++)
2865  * {
2866  * for (j=0; j<N; j++)
2867  * S0;
2868  * for (j=0; j<N; j++)
2869  * S1;
2870  * }
2871  * \endcode
2872  */
2873  void after(computation &comp, int level);
2874 
2875  /**
2876  * Schedule this computation to run after the computation \p comp.
2877  * The computations are placed after each other in the loop level \p level.
2878  * The outermost loop level is 0. The root level is computation::root_dimension.
2879  *
2880  * For example assuming we have the two computations
2881  *
2882  * {S0[i,j]: 0<=i<N and 0<=j<N} and {S1[i,j]: 0<=i<N and 0<=j<N}
2883  *
2884  * In order to make S1 run after S0 in the i loop, one should use
2885  *
2886  * S1.after_low_level(S0,0)
2887  *
2888  * which means: S1 is after S0 at the loop level 0 (which is i).
2889  *
2890  * The corresponding code is
2891  *
2892  * \code
2893  * for (i=0; i<N; i++)
2894  * {
2895  * for (j=0; j<N; j++)
2896  * S0;
2897  * for (j=0; j<N; j++)
2898  * S1;
2899  * }
2900  * \endcode
2901  *
2902  * S1.after_low_level(S0,1)
2903  *
2904  * means: S1 is after S0 at the loop level 1 (which is j) and would yield
2905  * the following code
2906  *
2907  * \code
2908  * for (i=0; i<N; i++)
2909  * for (j=0; j<N; j++)
2910  * {
2911  * S0;
2912  * S1;
2913  * }
2914  * \endcode
2915  *
2916  * S1.after_low_level(S0, computation::root_dimension)
2917  * means S1 is after S0 at the main program level and would yield
2918  * the following code
2919  *
2920  * \code
2921  * for (i=0; i<N; i++)
2922  * for (j=0; j<N; j++)
2923  * S0;
2924  * for (i=0; i<N; i++)
2925  * for (j=0; j<N; j++)
2926  * S1;
2927  * \endcode
2928  *
2929  * To specify that this computation is after \p comp in multiple levels,
2930  * the user can provide those levels in the \p levels vector.
2931  *
2932  * S1.after_low_level(S0, {0,1})
2933  *
2934  * means that S1 is after S0 in the loop level 0 and in the loop level 1.
2935  *
2936  * Note that
2937  *
2938  * S1.after_low_level(S0, L)
2939  *
2940  * would mean that S1 and S0 share the same loop nests for all the loop
2941  * levels that are before L and that S1 is after S0 in L only. S1 is not
2942  * after S0 in the loop levels that are before L.
2943  *
2944  */
2945  // @{
2946  void after_low_level(computation &comp, int level);
2947  void after_low_level(computation &comp, std::vector<int> levels);
2948  // @}
2949  /**
2950  * \brief Allocate a buffer for the computation automatically. The size of the buffer
2951  * is deduced automatically and a name is assigned to it automatically.
2952  *
2953  * \details Assuming the name of the computation is C, the name of the generated buffer
2954  * is _C_buffer.
2955  *
2956  * \p type is the type of the argument. Three possible types exist:
2957  * - a_input: for inputs of the function,
2958  * - a_output: for outputs of the function,
2959  * - a_temporary: for buffers used as temporary buffers within
2960  * the function (any temporary buffer is allocated automatically by
2961  * the Tiramisu runtime at the entry of the function and is
2962  * deallocated at the exit of the function).
2963  */
2964  void allocate_and_map_buffer_automatically(tiramisu::argument_t type = tiramisu::a_temporary);
2965 
2966  /**
2967  * \brief Apply a transformation on the schedule.
2968  *
2969  * \details This transformation is from
2970  * the time-space domain to the time-space domain. It is applied on
2971  * the range of the schedule (i.e., on the output of the schedule relation).
2972  *
2973  * For example, to shift the i dimension of the time-processor domain
2974  * of C0, you can apply the transformation
2975  *
2976  * C0[0, 0, i, 0, j, 0] -> C0[0, 0, i+2, 0, j, 0]
2977  *
2978  * To apply an interchange, you would do
2979  *
2980  * C0[0, 0, i, 0, j, 0] -> C0[0, 0, j, 0, i, 0]
2981  */
2982  void apply_transformation_on_schedule(std::string map_str);
2983 
2984  /**
2985  * \brief Schedule this computation to run before the computation \p consumer
2986  * at the loop level \p L.
2987  *
2988  * \details Notes
2989  * - The loop level \p L is a loop level of this computation.
2990  * - Use computation::root to indicate the root dimension
2991  * (i.e. the outermost time-space dimension).
2992  * - Calling this method with the same computations overwrites the level if it is
2993  * higher.
2994  * - A computation being scheduled after another computation at level L means it is
2995  * scheduled after that computation at all levels lower than L.
2996  * - There should be exactly one computation with no computation scheduled before it.
2997  * - Each other computation should have exactly one computation scheduled before it.
2998  */
2999  // @{
3000  void before(computation &consumer, tiramisu::var L);
3001  // @}
3002 
3003  /**
3004  * Schedule this computation to run after \p before_comp at the loop level \p before_l,
3005  * and before \p after_comp at loop level \p after_l. The outermost loop level is 0.
3006  *
3007  * Use computation::root_dimension to indicate the root dimension
3008  * (i.e. the outermost time-space dimension).
3009  *
3010  * If there was already a direct scheduling between \p before_comp and \p after_comp
3011  * (e.g. using before, after, between...), that schedule is overwritten; i.e. it no longer
3012  * exists/has an effect.
3013  *
3014  * Note that as with all other scheduling methods:
3015  * - Calling this method with the same computations overwrites the levels if they are
3016  * higher.
3017  * - A computation being scheduled after another computation at level L means it is
3018  * scheduled after that computation at all levels lower than L.
3019  * - There should be exactly one computation with no computation scheduled before it.
3020  * - Each other computation should have exactly one computation scheduled before it.
3021  */
3022  void between(computation &before_comp, tiramisu::var before_l, computation &after_comp, tiramisu::var after_l);
3023 
3024  /**
3025  * \brief Store this computation in \p buff
3026  *
3027  * \details
3028  *
3029  * Let us assume that we have a computation C:
3030  *
3031  * \code
3032  * {C[i]: 0<=i<N}
3033  * \endcode
3034  *
3035  * and that we want to store each C(i) in bufC[i]. Then we
3036  * can use store_in() to indicate that as follows:
3037  *
3038  * \code
3039  * C.store_in(&bufC)
3040  * \endcode
3041  *
3042  * This means that each computation C(i) will be stored
3043  * in the buffer location bufC[i].
3044  *
3045  * If \p iterators is specified, the \p iterators are used to specify how the
3046  * computation is mapped to the buffer.
3047  * If the dimensions of this computation are in0, in1, ..., inn and if
3048  * \p iterators are equal to im0, im1, ..., imm then the computation is
3049  * mapped as follows
3050  *
3051  * \code
3052  * C[in0, in1, ..., inn]->bufC[im0, im1, ..., imm].
3053  * \endcode
3054  *
3055  * i.e., the computation C[in0, in1, ..., inn] is stored in bufC[im0,
3056  * im1, ..., imm].
3057  *
3058  * This can be used to store the data in many ways (reordering the
3059  * storage, storing into modulo buffers, ...).
3060  *
3061  * Assuming we have a computation D(i,j) that has the following
3062  * iteration domain:
3063  *
3064  * \code
3065  * {D[i,j]: 0<=i<N and 0<=j<N}
3066  * \endcode
3067  *
3068  * and assuming we have a buffer bufD.
3069  *
3070  * The store_in() function can be used to implement many types of data mappings:
3071  * - Store the computation D to a scalar: D.store_in(&bufD, {}).
3072  * This means that D(i) will be stored in bufD[0] (which represents a
3073  * scalar).
3074  * - Store a 2 dimensional computation into a 1-dimensional
3075  * buffer: D.store_in(&bufD, {i});
3076  * - Change the order of storage.
3077  * D.store_in(&bufD, {j, i}) will store D(i,j) in bufD(j,i).
3078  * - Store the computation in a circular buffer (modulo storage).
3079  * D.store_in(&bufD, {i%4, j%4});
3080  * This will store D(i,j) in bufD[i%4, j%4]. Assuming the buffer
3081  * bufD is a 4x4 buffer.
3082  */
3083  // @{
3084  void store_in(buffer *buff);
3085  void store_in(buffer *buff, std::vector<expr> iterators);
3086  // @}
3087 
3088  /**
3089  * This function assumes that \p consumer consumes values produced by
3090  * this computation (which is the producer).
3091  *
3092  * Compute this computation as needed for each unique value of the
3093  * \p consumer.
3094  *
3095  * This computation is scheduled so that the values consumed by the
3096  * \p consumer are computed at the level \p L and in the same loop
3097  * nest of the consumer. \p L is a loop level in the consumer.
3098  *
3099  * If the consumer needs this computation to be computed redundantly,
3100  * the function creates the necessary redundant computations and schedules
3101  * them before the consumer.
3102  *
3103  * This function performs the following:
3104  * - schedules this computation to be executed as needed before
3105  * the consumer.
3106  * - if this computation needs to be computed redundantly, redundant
3107  * computations are created.
3108  *
3109  * This function does not:
3110  * - create any data mapping to this computation. It is up to the
3111  * user to provide an access relation to this computation as he
3112  * would do to any other normal computation.
3113  * - it does not allocate any buffer to this computation. It is
3114  * up to the user to declare a buffer where the results of this
3115  * computation will be stored.
3116  *
3117  * If this functions creates a duplicate of the computation, the user
3118  * does not need to set its access relation. The duplicated computation
3119  * will automatically have the same access relation as the original
3120  * computation. This access relation is set automatically.
3121  *
3122  * This function does not return a handler to manipulate the duplicate
3123  * computation. It does not allow the user to manipulate the duplicate
3124  * freely. The duplicate is scheduled automatically to be executed
3125  * before the consumer.
3126  */
3127  void compute_at(computation &consumer, tiramisu::var L);
3128  void compute_at(computation &consumer, int L);
3129 
3130  /**
3131  * Generates the time-space domain and construct an AST that scans that
3132  * time-space domain, then compute the depth of this AST.
3133  * This is useful for example to know if all the dimensions of the time-space
3134  * domain will correspond to a loop level in the final generated AST.
3135  */
3136  int compute_maximal_AST_depth();
3137 
3138  /**
3139  * Dump the iteration domain of the computation. This is useful for
3140  * debugging.
3141  */
3142  void dump_iteration_domain() const;
3143 
3144  /**
3145  * Dump the schedule of the computation. This is mainly useful for
3146  * debugging.
3147  *
3148  * The schedule is a relation between the iteration space and the
3149  * time space. The relation provides a logical date of execution for
3150  * each point in the iteration space.
3151  * The schedule needs first to be set before calling this function.
3152  */
3153  void dump_schedule() const;
3154 
3155  /**
3156  * Dump the computation on stdout. This is mainly useful for
3157  * debugging.
3158  */
3159  void dump() const;
3160 
3161  /**
3162  * Fuse this computation with the computation passed as argument
3163  * in the same loop. Run this computation after that computation.
3164  * Fuse them at the loop level \p lev.
3165  *
3166  * For example, assuming we have the following computations
3167  *
3168  * {S0[i,j]: 0<=i<N and 0<=j<N}, {S1[i,j]: 0<=i<N and 0<=j<N}
3169  * and {S2[i,j]: 0<=i<N and 0<=j<N}.
3170  *
3171  * Without fusion, these computations would be equivalent
3172  * to the following loops nests
3173  *
3174  * \code
3175  * for (i=0; i<N; i++)
3176  * for (j=0; j<N; j++)
3177  * S0;
3178  *
3179  * for (i=0; i<N; i++)
3180  * for (j=0; j<N; j++)
3181  * S1;
3182  *
3183  * for (i=0; i<N; i++)
3184  * for (j=0; j<N; j++)
3185  * S2;
3186  * \endcode
3187  *
3188  * To fuse them, one should call
3189  *
3190  * \code
3191  * S2.fuse_after(j, S1);
3192  * S1.fuse_after(j, S0);
3193  * \endcode
3194  *
3195  * This would result in fusing S2 with S0 and S1 at loop level j.
3196  * S2 will be scheduled for execution after S0 and S1. The resulting code would look like
3197  *
3198  * \code
3199  * for (i=0; i<N; i++)
3200  * for (j=0; j<N; j++)
3201  * {
3202  * S0;
3203  * S1;
3204  * S2;
3205  * }
3206  * \endcode
3207  *
3208  * Calling
3209  *
3210  * \code
3211  * S2.fuse_after(i, S1);
3212  * S1.fuse_after(i, S0);
3213  * \endcode
3214  *
3215  * would result in the following code
3216  *
3217  * \code
3218  * for (i=0; i<N; i++)
3219  * {
3220  * for (j=0; j<N; j++)
3221  * S0;
3222  * for (j=0; j<N; j++)
3223  * S1;
3224  * for (j=0; j<N; j++)
3225  * S2;
3226  * }
3227  * \endcode
3228  *
3229  */
3231  {
3232  assert(lev.get_name().size() > 0);
3233 
3234  this->after(comp, lev);
3235  }
3236 
3237  /**
3238  * Generate the time-space domain of the computation.
3239  *
3240  * In this representation, the logical time of execution and the
3241  * processor where the computation will be executed are both
3242  * specified. The memory location where computations will be
3243  * stored in memory is not specified at this level.
3244  */
3245  void gen_time_space_domain();
3246 
3247  /**
3248  * Specify that the rank loop iterator should be removed from linearization.
3249  */
3250  void drop_rank_iter(var level);
3251 
3252  /**
3253  * Get the data type of the computation.
3254  */
3255  tiramisu::primitive_t get_data_type() const;
3256 
3257  /**
3258  * Return the Tiramisu expression associated with the computation.
3259  */
3260  const tiramisu::expr &get_expr() const;
3261 
3262  /**
3263  * Return the iteration domain of the computation.
3264  * In this representation, the order of execution of computations
3265  * is not specified, the computations are also not mapped to memory.
3266  */
3267  isl_set *get_iteration_domain() const;
3268 
3269  /**
3270  * Get the last update of a computation.
3271  */
3272  tiramisu::computation& get_last_update();
3273 
3274  /**
3275  * Search the time-space domain (the range of the schedule) and
3276  * return the loop level number that correspond to the dimension
3277  * named \p dim.
3278  * In other words, translate the vector of dimension name (\p dim_name)
3279  * into a loop level number.
3280  */
3282  {
3283  return this->get_loop_level_numbers_from_dimension_names({dim_name})[0];
3284  }
3285 
3286  /**
3287  * Return the name of the computation.
3288  */
3289  const std::string &get_name() const;
3290 
3291  /**
3292  * Returns a pointer to the computation scheduled immediately before this computation,
3293  * or a null pointer if none exists.
3294  */
3295  computation * get_predecessor();
3296 
3297  /**
3298  * Returns the \p index update that has been added to this computation such that:
3299  * - If \p index == 0, then this computation is returned.
3300  * - If \p index > 0, then it returns the \p index -th computation added through add_definitions.
3301  */
3302  tiramisu::computation& get_update(int index);
3303 
3304  /**
3305  * Get the schedule of the computation.
3306  */
3307  isl_map *get_schedule() const;
3308 
3309  /**
3310  * Tile the computation and then tag the outermost tile dimension
3311  * to be mapped to GPU blocks and tag the innermost tile dimensions
3312  * to be mapped to GPU threads.
3313  *
3314  * Tile the two loop levels \p L0 and \p L1 with rectangular
3315  * tiling. \p sizeX and \p sizeY represent the tile size.
3316  * \p L0 and \p L1 should be two consecutive loop levels
3317  * (i.e., \p L0 = \p L1 + 1) and they should satisfy
3318  * \p L0 > \p L1.
3319  *
3320  * \p L0_outer, \p L1_outer, \p L0_inner, \p L1_inner
3321  * are the names of the new dimensions created after tiling.
3322  */
3323  // @{
3324  void gpu_tile(tiramisu::var L0, tiramisu::var L1, int sizeX, int sizeY);
3325  void gpu_tile(tiramisu::var L0, tiramisu::var L1, int sizeX, int sizeY,
3326  tiramisu::var L0_outer, tiramisu::var L1_outer,
3327  tiramisu::var L0_inner, tiramisu::var L1_inner);
3328  void gpu_tile(tiramisu::var L0, tiramisu::var L1, tiramisu::var L2, int sizeX, int sizeY, int sizeZ);
3329  void gpu_tile(tiramisu::var L0, tiramisu::var L1, tiramisu::var L2, int sizeX, int sizeY, int sizeZ,
3330  tiramisu::var L0_outer, tiramisu::var L1_outer, tiramisu::var L2_outer,
3331  tiramisu::var L0_inner, tiramisu::var L1_inner, tiramisu::var L2_inner);
3332  // @}
3333 
3334  /**
3335  * Return the buffer that was allocated automatically using
3336  * high level data mapping functions.
3337  * If no automatic buffer was allocated, this function returns NULL.
3338  */
3339  tiramisu::buffer *get_automatically_allocated_buffer();
3340 
3341  /**
3342  * Interchange (swap) the two loop levels \p L0 and \p L1.
3343  */
3344  void interchange(tiramisu::var L0, tiramisu::var L1);
3345 
3346  /**
3347  * Identical to
3348  * void interchange(tiramisu::var L0, tiramisu::var L1);
3349  */
3350  void interchange(int L0, int L1);
3351 
3352  /**
3353  * Mark this statement as a let statement.
3354  */
3355  void mark_as_let_statement();
3356 
3357  /**
3358  * Mark this statement as a library call.
3359  */
3360  void mark_as_library_call();
3361 
3362  /**
3363  * Tag the loop level \p L to be parallelized.
3364  *
3365  * This function is equivalent to the function \ref tiramisu::computation::tag_parallel_level() .
3366  * There is no difference between the two.
3367  *
3368  */
3369  void parallelize(tiramisu::var L);
3370 
3371  /**
3372  * Set the access relation of the computation.
3373  *
3374  * The access relation is a relation from computations to buffer locations.
3375  * \p access_str is a string that represents the relation. It is encoded
3376  * in the ISL format, http://isl.gforge.inria.fr/user.html#Sets-and-Relations
3377  * of relations.
3378  *
3379  * Note that, in Tiramisu, the access relations of computations that have the same name
3380  * must be identical.
3381  *
3382  * Examples: tutorial_01, tutorial_02, tutorial_08 (actually most tutorials have set_access()).
3383  */
3384  // @{
3385  void set_access(std::string access_str);
3386  void set_access(isl_map *access);
3387  // @}
3388 
3389  void set_wait_access(std::string access_str);
3390  void set_wait_access(isl_map *access);
3391 
3392  /**
3393  * Set the expression of the computation.
3394  */
3395  void set_expression(const tiramisu::expr &e);
3396 
3397  /**
3398  * Sets whether the computation is inline or not, based on the value of \p is_inline.
3399  * If a computation is inline, accesses to the computation return the expression of that
3400  * computation.
3401  * E.g. if an inline computation S(i,j) is defined with the expression i + j,
3402  * then S(i + 1, j * i) returns the expression i + 1 + j * i.
3403  * If \p is_inline is not provided, the computation is set to be inline.
3404  */
3405  void set_inline(bool is_inline = true);
3406 
3407  /**
3408  * Returns true if and only if the computation is inline.
3409  */
3410  const bool is_inline_computation() const;
3411 
3412  /**
3413  * Set the schedule indicated by \p map.
3414  *
3415  * \p map is a string that represents a mapping from the iteration domain
3416  * to the time-space domain (the ISL format to represent maps is
3417  * documented in http://barvinok.gforge.inria.fr/barvinok.pdf in Sec 1.2.2).
3418  *
3419  * The schedule is a map from the iteration domain to a time space
3420  * domain. The same name of space should be used for both the range
3421  * and the domain of the schedule.
3422  *
3423  * In general, users can set the schedule using high level functions such
3424  * as before(), after(), tile(), compute_at(), vectorize(), split(), ...
3425  * The use of this function is only reserved for advanced users who want
3426  * a low level control of the schedule.
3427  *
3428  * Vectors in the time-space domain have the following form
3429  *
3430  * computation_name[redundancy_ID,static,dynamic,static,dynamic,static,dynamic,static,...]
3431  *
3432  * The first dimension of the vector is used to indicate the redundancy ID
3433  * (the notion of the redundancy ID is explained later).
3434  *
3435  * The following dimensions are interleaved dimensions: static, dynamic, static,
3436  * dynamic, ...
3437  * Dynamic dimensions represent the loop levels, while static dimensions are
3438  * used to order statements within a given block of statements in a given loop
3439  * level.
3440  * For example, the computations c0 and c1 in the following loop nest
3441  *
3442  * \code
3443  * for (i=0; i<N: i++)
3444  * for (j=0; j<N; j++)
3445  * {
3446  * c0;
3447  * c1;
3448  * }
3449  * \endcode
3450  *
3451  * have the following representations in the iteration domain
3452  *
3453  * \code
3454  * {c0[i,j]: 0<=i<N and 0<=j<N}
3455  * {c1[i,j]: 0<=i<N and 0<=j<N}
3456  * \endcode
3457  *
3458  * and the following representation in the time-space domain
3459  *
3460  * \code
3461  * {c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
3462  * {c1[0,0,i,0,j,1]: 0<=i<N and 0<=j<N}
3463  * \endcode
3464  *
3465  * The first dimension (dimension 0) in the time-space
3466  * domain (the leftmost dimension) is the redundancy ID
3467  * (in this case it is 0, the meaning of this ID will be explained later).
3468  * The second dimension (starting from the left) is a static dimension,
3469  * the third dimension is a dynamic dimension that
3470  * represents the loop level i, ..., the fifth dimension is a dynamic
3471  * dimension that represents the loop level j and the last dimension
3472  * (dimension 5) is a static dimension and allows the ordering of
3473  * c1 after c0 in the loop nest.
3474  *
3475  * To transform the previous iteration domain to the
3476  * time-space domain, the following schedule should be used:
3477  *
3478  * \code
3479  * {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
3480  * {c1[i,j]->c1[0,0,i,0,j,1]: 0<=i<N and 0<=j<N}
3481  * \endcode
3482  *
3483  * The first dimension called "redundancy ID" is only meaningful if the
3484  * computation is redundant. i.e., some parts of the computation are
3485  * redundantly computed. Redundant computations are in general used to
3486  * maximize parallelism and data locality on the expense of doing some
3487  * computations redundantly.
3488  * For example, if the two computations c1(i,j) and c2(i,j) both depend
3489  * on the computation c0(i,j), instead of waiting for c0(i,j) to be
3490  * computed and then computing c1(i,j) and c2(i,j) in parallel, the thread
3491  * executing c1(i,j) can compute c0(i,j) by itself and then run c1(i,j).
3492  * The thread that computes c2(i,j) can do the same and compute c0(i,j)
3493  * by itself and then compute c2(i,j). In this case the two threads do not
3494  * need to wait. This is done at the expense of redundant computation since
3495  * c0(i,j) is computed by both threads.
3496  *
3497  * In general redundant computations are useful when tiling stencil
3498  * computations. In the context of stencils such a tiling is called
3499  * "overlapped tiling". Tiles that depend on results computed by other
3500  * tiles that run in parallel can compute the boundaries redundantly which
3501  * allows them to avoid waiting and thus can run in parallel.
3502  *
3503  * In Tiramisu, the user can indicate that a chunk of a computation
3504  * should be computed redundantly. The original computation always has a redundancy
3505  * ID equal to 0 (which means this is the original computation).
3506  * The redundant chunk has an ID that is different from 0 and that is
3507  * used to uniquely identify it.
3508  *
3509  * For example if we want to compute all of c0 three times (that is,
3510  * compute the original computation and compute two redundant computations),
3511  * we can use the following schedules:
3512  *
3513  * The schedule of the original computation: {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
3514  * The schedule of the redundant computation N°1: {c0[i,j]->c0[1,0,i,0,j,0]: 0<=i<N and 0<=j<N}
3515  * The schedule of the redundant computation N°2: {c0[i,j]->c0[2,0,i,0,j,0]: 0<=i<N and 0<=j<N}
3516  *
3517  * The schedule of c0 in this case would be three maps that map c0[i,j] to
3518  * the three different redundant computations in the time-processor domain:
3519  *
3520  * \code
3521  * {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N;
3522  * c0[i,j]->c0[1,0,i,0,j,0]: 0<=i<N and 0<=j<N;
3523  * c0[i,j]->c0[2,0,i,0,j,0]: 0<=i<N and 0<=j<N}
3524  * \endcode
3525  *
3526  * The function set_schedule() overrides any other schedule set by the high level
3527  * scheduling functions. Currently the user has to choose between using the high
3528  * level scheduling functions or using this low level set_schedule function. The user
3529  * cannot mix the use of the two in the same program because they are not compatible.
3530  */
3531  // @{
3532  void set_low_level_schedule(isl_map *map);
3533  void set_low_level_schedule(std::string map_str);
3534  // @}
3535 
3536  /**
3537  * Shift the loop level \p L0 of the iteration space by
3538  * \p n iterations.
3539  *
3540  * \p n can be a positive or a negative number. A positive
3541  * number means a shift forward of the loop iterations while
3542  * a negative value would mean a shift backward.
3543  */
3544  void shift(tiramisu::var L0, int n);
3545 
3546  /**
3547  * Apply loop skewing on the loop levels \p i and \p j with a skewing factor of \p f.
3548  * The names of the new loop levels is \p ni and \p nj.
3549  *
3550  * This command transforms the loop (i, j) into the loop (i, f*i+j).
3551  * For example if you have the following loop
3552  *
3553  * \code
3554  for (int i = 1; i < N; i++)
3555  for (int j = 1; j < M; j++)
3556  a[i][j] = a[i - 1][j] + a[i][j - 1];
3557  \endcode
3558 
3559  * and apply
3560 
3561  \code
3562  a.skew(i, j, 1, ni, nj);
3563  \endcode
3564 
3565  * you would get
3566  *
3567  \code
3568  for (int i = 1; i < N; i++)
3569  for (int j = i+1; j < i+M; j++)
3570  a[i][j - i] = a[i - 1][j - i] + a[i][j - i - 1];
3571  \endcode
3572 
3573  */
3574  void skew(tiramisu::var i, tiramisu::var j, int f, tiramisu::var ni, tiramisu::var nj);
3575 
3576  /**
3577  * Apply loop skewing on the loop levels \p i, \p j and \p k with a skewing factor of \p f.
3578  * The names of the new loop levels is \p ni, \p nj and \p nk.
3579  *
3580  * This command transforms the loop (i, j, k) into the loop (i, f*i+j, f*i+k).
3581  */
3582  void skew(tiramisu::var i, tiramisu::var j, tiramisu::var k, int factor,
3584 
3585  /**
3586  * Apply loop skewing on the loop levels \p i, \p j, \p k, \p l with a skewing factor of \p f.
3587  * The names of the new loop levels is \p ni, \p nj, \p nk and \p nl.
3588  *
3589  * This command transforms the loop (i, j, k, l) into the loop (i, f*i+j, f*i+k, f*i+l).
3590  */
3591  void skew(tiramisu::var i, tiramisu::var j, tiramisu::var k, tiramisu::var l, int factor,
3593 
3594  /**
3595  * \overload
3596  */
3597  void skew(tiramisu::var i, tiramisu::var j, int factor);
3598 
3599  /**
3600  * \overload
3601  */
3602  void skew(tiramisu::var i, tiramisu::var j, tiramisu::var k, int factor);
3603 
3604  /**
3605  * \overload
3606  */
3607  void skew(tiramisu::var i, tiramisu::var j, tiramisu::var k, tiramisu::var l, int factor);
3608 
3609  /**
3610  * \overload
3611  */
3612  void skew(int i, int j, int factor);
3613 
3614  /**
3615  * \overload
3616  */
3617  void skew(int i, int j, int k, int factor);
3618 
3619  /**
3620  * \overload
3621  */
3622  void skew(int i, int j, int k, int l, int factor);
3623 
3624  /**
3625  * Split the loop level \p L0 of the iteration space into two
3626  * new loop levels.
3627  *
3628  * \p sizeX is the extent (size) of the inner loop created after
3629  * splitting.
3630  */
3631  //@{
3632  void split(tiramisu::var L0, int sizeX);
3633  void split(tiramisu::var L0, int sizeX,
3634  tiramisu::var L0_outer, tiramisu::var L0_inner);
3635  //@}
3636 
3637  /**
3638  * Identical to
3639  * void split(tiramisu::var L0, int sizeX);
3640  */
3641  void split(int L0, int sizeX);
3642 
3643  /**
3644  * Fold the storage of the computation.
3645  * Fold the loop level \p dim by a factor \p f.
3646  */
3647  void storage_fold(tiramisu::var dim, int f);
3648 
3649  /**
3650  * Allocate the storage of this computation in the loop level \p L0.
3651  *
3652  * This function does the following:
3653  * - computes the size of the buffer needed to store this computation
 3654  * (TODO: currently the size computed by Tiramisu is equal to the size
3655  * of the computation, Tiramisu does not allocate smaller buffers if
3656  * such a thing is possible, this is left for future work).
3657  * - allocates a temporary buffer with the appropriate size,
3658  * - schedules the allocation operation to be executed in the loop
 3659  * nest where \p comp is computed at the loop level \p L0.
3660  *
3661  * The function returns the computation (operation) that allocates
3662  * the buffer. The allocated buffer is not returned.
3663  */
3665  tiramisu::var L0);
3666 
3667  /**
3668  * Tag the loop level \p L0 and \p L1 to be mapped to GPU.
3669  */
3670  // @{
3671  void tag_gpu_level(tiramisu::var L0, tiramisu::var L1);
3672  void tag_gpu_level(tiramisu::var L0, tiramisu::var L1, tiramisu::var L2, tiramisu::var L3);
3673  void tag_gpu_level(tiramisu::var L0, tiramisu::var L1, tiramisu::var L2, tiramisu::var L3, tiramisu::var L4, tiramisu::var L5);
3674  // @}
3675 
3676  /**
3677  * Tag the loop level \p L to be parallelized.
3678  */
3679  void tag_parallel_level(tiramisu::var L);
3680 
3681  /**
3682  * Identical to
3683  * void tag_parallel_level(int L);
3684  */
3685  void tag_parallel_level(int L);
3686 
3687  /**
3688  * Tag the loop level \p L to be vectorized.
3689  * \p len is the vector length.
3690  *
3691  * The user can only tag loop levels that have constant extent.
3692  * If a loop level does not have a constant extent, the user
3693  * should call .vectorize() command instead or he can call
3694  * separate() and split() manually.
3695  *
3696  * The user has to make sure that the extent of the dimension
3697  * is bigger than \p len. The vectorization of a loop that has
3698  * less than \p len iterations is not correct.
3699  *
3700  */
3701  void tag_vector_level(tiramisu::var L, int len);
3702 
3703  /**
3704  * Identical to
3705  * void tag_vector_level(tiramisu::var L, int len);
3706  */
3707  void tag_vector_level(int L, int len);
3708 
3709  /**
3710  * Tag the loop level \p L to be distributed.
3711  */
3712  // @{
3713  void tag_distribute_level(tiramisu::var L);
3714  void tag_distribute_level(int L);
3715  // @}
3716 
3717  /**
3718  * Tag the loop level \p L to be unrolled.
3719  *
3720  * The user can only tag loop levels that have constant extent.
3721  */
3722  void tag_unroll_level(tiramisu::var L);
3723 
3724  /**
3725  * Identical to
3726  * void tag_unroll_level(tiramisu::var L);
3727  */
3728  void tag_unroll_level(int L);
3729 
3730  /**
3731  * \brief Schedule this computation to run before the computation \p next_computation
3732  * at the loop level \p L and return \p next_computation.
3733  *
3734  * \details Notes
3735  * - This method is a simple wrapper around computation::after to help schedule
3736  * chaining as in:
3737  * \code
3738  * C1.then(C2, j)
3739  * .then(C3, computation::root)
3740  * .then(C4, i)
3741  * .then(C5, j);
3742  * \endcode
3743  * - The loop level \p L is a loop level of this computation.
3744  * - Use computation::root to indicate the root dimension
3745  * (i.e. the outermost time-space dimension).
3746  * - Calling this method with the same computations overwrites the level if it is
3747  * higher.
3748  * - A computation being scheduled after another computation at level L means it is
3749  * scheduled after that computation at all levels lower than L.
3750  */
3751  // @{
3752  computation &then(computation &next_computation, tiramisu::var L);
3753  // @}
3754 
3755  /**
3756  * Tile the two loop levels \p L0 and \p L1 with rectangular
3757  * tiling. \p sizeX and \p sizeY represent the tile size.
3758  * \p L0 and \p L1 should be two consecutive loop levels.
3759  * \p L0_outer, \p L1_outer, \p L0_inner, \p L1_inner
3760  * are the names of the new dimensions created after tiling.
3761  */
3762  // @{
3763  void tile(tiramisu::var L0, tiramisu::var L1,
3764  int sizeX, int sizeY);
3765  void tile(tiramisu::var L0, tiramisu::var L1,
3766  int sizeX, int sizeY,
3767  tiramisu::var L0_outer, tiramisu::var L1_outer,
3768  tiramisu::var L0_inner, tiramisu::var L1_inner);
3769  void tile(tiramisu::var L0, tiramisu::var L1, tiramisu::var L2,
3770  int sizeX, int sizeY, int sizeZ);
3771  void tile(tiramisu::var L0, tiramisu::var L1, tiramisu::var L2,
3772  int sizeX, int sizeY, int sizeZ,
3773  tiramisu::var L0_outer, tiramisu::var L1_outer,
3774  tiramisu::var L2_outer, tiramisu::var L0_inner,
3775  tiramisu::var L1_inner, tiramisu::var L2_inner);
3776  // @}
3777 
3778  /**
3779  * Tile the two loop levels \p L0 and \p L1 with rectangular
3780  * tiling. \p sizeX and \p sizeY represent the tile size.
3781  * \p L0 and \p L1 should be two consecutive loop levels
3782  * (i.e., \p L0 = \p L1 + 1) and they should satisfy
3783  * \p L0 > \p L1.
3784  */
3785  // @{
3786  void tile(int L0, int L1, int sizeX, int sizeY);
3787  void tile(int L0, int L1, int L2, int sizeX, int sizeY, int sizeZ);
3788  // @}
3789 
3790  /**
3791  * Unroll the loop level \p L with an unrolling factor \p fac.
3792  *
3793  * The difference between this function and the function
3794  * tag_unroll_level() is that this function separates
3795  * the iteration domain into full and partial iteration
3796  * domains for unrolling first and then it calls
3797  * tag_unroll_level().
3798  * tag_unroll_level() only tags a dimension to
3799  * be unrolled, it does not modify the tagged dimension.
3800  *
3801  * This function separates the iteration domain into two iteration
3802  * domains, a full iteration domain and a partial iteration domain.
3803  * The full iteration domain has an upper bound that is multiple of
3804  * \p fac while the other does not.
3805  * The full iteration domain is then split by \p fac and the inner loop
3806  * (which should have a constant extent equal to \p fac) is tagged as
3807  * a unrolled loop.
3808  *
 3809  * Let us assume the following loop (a loop represents an iteration
3810  * domain)
3811  *
3812  * \code
3813  * for (i=0; i<N; i++)
3814  * for (j=0; j<23; j++)
3815  * S0;
3816  * \endcode
3817  *
3818  * To unroll the j loop with an unrolling factor of 4, one should call
3819  *
3820  * S0.unroll(j, 4);
3821  *
3822  * The loop (iteration domain) is first separated into the following
3823  * two loops
3824  *
3825  * \code
3826  * for (int i=0; i<20; i++)
3827  * S0;
3828  *
3829  * for (int i=20; i<23; i++)
3830  * S0;
3831  * \endcode
3832  *
3833  * The full loop is then split by 4
3834  *
3835  * \code
3836  * for (int i1=0; i1<20/4; i1++)
3837  * for (int i2=0; i2<4; i2++)
3838  * S0;
3839  *
3840  * for (int i=20; i<23; i++)
3841  * S0;
3842  * \endcode
3843  *
3844  * the i2 loop is then tagged to be unrolled.
3845  *
3846  * \p L_outer and \p L_inner are the names of the new loops created
3847  * after splitting. If not provided, default names will be assigned.
3848  * \p L_outer is the outer loop.
3849  *
3850  */
3851  //@{
3852  void unroll(tiramisu::var L, int fac);
3853  void unroll(tiramisu::var L, int fac, tiramisu::var L_outer, tiramisu::var L_inner);
3854  //@}
3855 
3856  /**
3857  * Vectorize the loop level \p L. Use the vector length \p v.
3858  *
3859  * The difference between this function and the function
3860  * tag_vector_level() is that this function
3861  * prepares the iteration domain for vectorization first
3862  * and then it calls tag_vector_level().
3863  * tag_vector_level() only tags a dimension to
3864  * be vectorized, it does not change the tagged dimension.
3865  *
3866  * This function will separate the iteration domain into two iteration
3867  * domains, a full iteration domain and a partial iteration domain.
3868  * The full iteration domain has an upper bound that is multiple of
3869  * \p v while the other does not.
3870  * The full iteration domain is then split by \p v and the inner loop
3871  * (which should have a constant extent equal to \p v) is tagged as
3872  * a vector loop.
3873  *
 3874  * Let us assume the following loop (a loop represents an iteration
3875  * domain)
3876  *
3877  * \code
3878  * for (i=0; i<N; i++)
3879  * for (j=0; j<23; j++)
3880  * S0;
3881  * \endcode
3882  *
3883  * To vectorize the j loop with a vector length 4, one should call
3884  *
3885  * S0.vectorize(j, 4);
3886  *
3887  * The loop (iteration domain) is first separated into the following
3888  * two loops
3889  *
3890  * \code
3891  * for (int i=0; i<20; i++)
3892  * S0;
3893  *
3894  * for (int i=20; i<23; i++)
3895  * S0;
3896  * \endcode
3897  *
3898  * The full loop is then split by 4
3899  *
3900  * \code
3901  * for (int i1=0; i1<20/4; i1++)
3902  * for (int i2=0; i2<4; i2++)
3903  * S0;
3904  *
3905  * for (int i=20; i<23; i++)
3906  * S0;
3907  * \endcode
3908  *
3909  * the i2 loop is then tagged to be vectorized.
3910  *
3911  * The user has to make sure that the extent of the dimension
3912  * is bigger than \p v. The vectorization of a loop that has
3913  * less than \p v iterations is not correct.
3914  *
3915  * The names of the new loop iterators created after vectorization
 3916  * are \p L_outer and \p L_inner. If not provided, default names
 3917  * are assigned.
3918  */
3919  // @{
3920  void vectorize(tiramisu::var L, int v);
3921  void vectorize(tiramisu::var L, int v, tiramisu::var L_outer, tiramisu::var L_inner);
3922  // @}
3923 
3924  /**
3925  * root_dimension is a number used to specify the dimension level
3926  * known as root.
3927  * The root dimension level is the outermost level. It is the level
3928  * outside any loop nest. Loop level 0 is the level of the first loop
3929  * (outermost loop), loop 1 is the level of following inner loop, ...
3930  *
3931  * Where is this number used ?
3932  *
3933  * These numbers are used in the helper functions used for scheduling
3934  * (such as after(), before(), ...).
3935  * For example, c0.after(c1) indicates that the computation c0 should
3936  * be executed after the computation c1.
3937  * Since the two computations c0 and c1 are usually nested in a loop,
3938  * we need to specify at which loop level c0 is after c1. This is where
3939  * we need to specify the loop level numbers.
3940  * Here is an example. Suppose that the two computations c0 and c1
3941  * have the following iteration domains
3942  * {c0[i,j]: 0<=i<N and 0<=j<N} and {c1[i,j]: 0<=i<N and 0<=j<N}.
3943  *
3944  * When code is generated for the two computations, two loop nests
3945  * are generated. When scheduling c0 after c1 using the after function,
3946  * the user can choose one among three possibilities in specifying at
3947  * which level c0 is after c1.
3948  *
3949  * - c0.after(c1, computation::root_dimension) would create a schedule
3950  * that generates the following code
3951  *
3952  * \code
3953  * for (i=0; i<N; i++)
3954  * for (j=0; j<N; j++)
3955  * c1;
3956  * for (i=0; i<N; i++)
3957  * for (j=0; j<N; j++)
3958  * c0;
3959  * \endcode
3960  *
3961  * - c0.after(c1, 0) would create a schedule that generates the
3962  * following code
3963  *
3964  * \code
3965  * for (i=0; i<N; i++) {
3966  * for (j=0; j<N; j++)
3967  * c1;
3968  * for (j=0; j<N; j++)
3969  * c0;
3970  * }
3971  * \endcode
3972  *
3973  * This means that c0 is after c1 starting from loop level 0,
3974  * (before the loop level 0, c0 and c1 have the same order).
3975  *
3976  * - c0.after(c1, 1) would create a schedule that generates the
3977  * following code
3978  *
3979  * \code
3980  * for (i=0; i<N; i++)
3981  * for (j=0; j<N; j++) {
3982  * c1;
3983  * c0;
3984  * }
3985  * \endcode
3986  *
3987  * This means that c0 is after c1 starting from loop level 1,
3988  * (before the loop level 1, c0 and c1 have the same order).
3989  */
3990  const static var root;
3991 
3992  /**
3993  * Equivalent of computation::root but to be used with scheduling
3994  * functions that take loop level (integers) as input instead of
3995  * tiramisu::var.
3996  */
3997  const static int root_dimension = -1;
3998 
3999  /**
4000  * Access operator: C0(i,j) represents an access to
4001  * the element (i,j) of the computation C0.
4002  * C0(i,j) represents the value computed by the computation
4003  * C0(i,j)
4004  */
4005  template<typename... Args> tiramisu::expr operator()(Args... args)
4006  {
4007  // TODO move to cpp
4008  std::vector<tiramisu::expr> access_expressions{std::forward<Args>(args)...};
4009  if (access_expressions.size() != number_of_dims)
4010  {
4011  tiramisu::str_dump("Error - Incorrect access: " + this->get_name() + "(");
4012  for (int i = 0; i < access_expressions.size(); i++)
4013  {
4014  tiramisu::expr e = access_expressions[i];
4015  e.dump(false);
4016  if (i != access_expressions.size() - 1)
4017  tiramisu::str_dump(", ");
4018  }
4019  tiramisu::str_dump(").\n");
4020  tiramisu::str_dump("The number of access dimensions does not match that used in the declaration of " + this->get_name() + ".\n\n");
4021  exit(1);
4022  }
4023 
4024  if (this->is_inline_computation()) {
4025  std::vector<std::pair<var, expr>> substitutions;
4026  for (auto const &variable: this->access_variables) {
4027  // variable is an (index, variable_name) pair
4028  substitutions.push_back(std::make_pair(var(variable.second, false),
4029  access_expressions[variable.first]));
4030  }
4031  // TODO add iteration space for expression
4032  return this->expression.substitute(substitutions);
4033  } else {
4035  this->get_name(),
4036  access_expressions,
4037  this->get_data_type());
4038  }
4039  }
4040 
4041  operator expr();
4042 
4043  static xfer create_xfer(std::string send_iter_domain, std::string recv_iter_domain, tiramisu::expr send_dest,
4044  tiramisu::expr recv_src, xfer_prop send_prop, xfer_prop recv_prop,
4045  tiramisu::expr send_expr, tiramisu::function *fct);
4046 
4047  static xfer create_xfer(std::string iter_domain, xfer_prop prop, tiramisu::expr expr,
4048  tiramisu::function *fct);
4049 
4050 };
4051 
4052 class input: public computation
4053 {
4054 
4055 public:
4056  /**
4057  * \brief Constructor for an input.
4058  *
4059  * \details
4060  *
4061  * Declare an input.
4062  *
4063  * \p name is the name of the input.
4064  *
4065  * \p iterator_variables is a vector that represents the dimensions of
4066  * the input. It is used to define the size of the input.
4067  *
4068  * \p t is the type of the input elements.
4069  * Example of types include (p_uint8, p_uint16, p_uint32, ...).
4070  * Types are defined in \ref tiramisu::primitive_t
4071  *
4072  * Example:
4073  *
4074  * To declare a buffer buf[20, 30] where the buffer elements
4075  * are of type uint8. We can first declare two iterator variables
4076  *
4077  * \code
4078  * var i("i", 0, 20), j("j", 0, 30);
4079  * \endcode
4080  *
4081  * and then we can declare the following input
4082  *
4083  * \code
4084  * input A("A", {i,j}, p_uint8);
4085  * \endcode
4086  *
4087  * Later in the code (in Layer III), we need to actually declare
4088  * the buffer and map this input to that buffer.
4089  *
4090  * An example is provided in tutorial 02.
4091  *
4092  */
4093  input(std::string name, std::vector<var> iterator_variables, primitive_t t):
4094  computation(name, iterator_variables, expr(), false)
4095  {
4096  this->data_type = t;
4097  this->expression.dtype = t;
4098  }
4099 
4100  /**
4101  * \overload
4102  */
4103  input(std::vector<var> iterator_variables, primitive_t t):
4104  input(generate_new_computation_name(), iterator_variables, t)
4105  {
4106  }
4107 };
4108 
4109 class Input: public input{
4110 
4111 public:
4112  std::vector<var> iterators_from_size_expressions(std::vector<expr> sizes)
4113  {
4114  std::vector<var> iterator_variables;
4115 
4116  for (int s = 0; s < sizes.size(); s++)
4117  {
4118  var v(generate_new_computation_name(), 0, sizes[s]);
4119  iterator_variables.push_back(v);
4120  }
4121 
4122  return iterator_variables;
4123  }
4124 
4125  /**
4126  * \overload
4127  */
4128  Input(std::string name, std::vector<expr> sizes, primitive_t t)
4129  :input(name, iterators_from_size_expressions(sizes), t)
4130  {
4131  }
4132 };
4133 
4134 class view:public computation{
4135 
4136 public:
4137  /**
4138  * \brief Constructor for a view.
4139  *
4140  * \details
4141  *
4142  * Declare a view.
4143  *
4144  * \p name is the name of the view.
4145  *
4146  * \p iterator_variables is a vector that represents the dimensions of
4147  * the view. It is used to define the size of the view.
4148  *
4149  * \p t is the type of buffer data (p_float32 for example).
4150  *
4151  * Example:
4152  *
4153  * If we want to access a buffer buf where results of a computation C are stored
4154  * we declare a view V and we map it to the buffer buf.
4155  *
4156  * We declare the iterator variables
4157  *
4158  * \code
4159  * var i("i", 0, 20), j("j", 0, 30);
4160  * \endcode
4161  *
4162  * and then we can declare the following view
4163  *
4164  * \code
4165  * view V("V", {i,j}, p_float64);
4166  * \endcode
4167  *
4168  * Later in the code (in Layer III), we need to map the view to that buffer.
4169  * \code
4170  * V.store_in(&buf);
4171  * \endcode
4172  */
4173  view(std::string name, std::vector<var> iterator_variables, primitive_t t):
4174  computation(name, iterator_variables, expr(), false,t){}
4175 
4176 };
4177 
4178 
4179 
4180 
4181 /**
4182  * A class that represents loop invariants.
4183  *
4184  * An object of the invariant class can be an expression, a symbolic constant
4185  * or a variable that is invariant to all the loops of the function.
4186  */
 4187 class constant: public computation
 4188 {
 4189 private:
 4190  /**
 4191  * If this constant is not function wide, i.e., if it is computed
 4192  * with a computation, then \p compute_with_computation is a pointer on the
 4193  * computation with whom this constant should be computed.
 4194  * We need to know this because we need to translate the accesses
 4195  * used within the RHS of this constant expression to the new scheduled
 4196  * accesses. Since a computation does not have a buffer access (it is
 4197  * translated into a let statement), it also does not have an iterator
 4198  * map, thus it cannot translate its accesses to scheduled accesses.
 4199  * Thus we use the iterator map of the computation with whom we
 4200  * compute this constant and take its iterator map.
 4201  */
 4202  tiramisu::computation *compute_with_computation;
 4203 
 4204 public:
 4205 
 4206  /**
 4207  * Create a constant where \p param_name is the name of
 4208  * the constant that will hold the value of the constant.
 4209  *
 4210  * \p param_expr is the expression that defines the value
 4211  * of the constant.
 4212  *
 4213  * \p t indicates the type of the constant.
 4214  *
 4215  * \p function_wide should be set to true if the constant is
 4216  * defined at the entry of the function and is visible to all
 4217  * the computations.
 4218  * If function_wide is set to true, then the constant is an
 4219  * invariant to the whole function where it is declared.
 4220  *
 4221  * \p with_computation, should be set only if function_wide
 4222  * is false, i.e. if the constant is not function wide.
 4223  * In such a case the user should indicate where the
 4224  * constant should be assigned.
 4225  * \p with_computation indicates that the assignment should
 4226  * be in the loop nest that computes the computation indicated by
 4227  * \p with_computation at the loop level indicated
 4228  * by \p at_loop_level.
 4229  * The root level (i.e. the level outer than any other loop level)
 4230  * is computation::root_dimension.
 4231  * 0 represents the first loop level and 1 represents the second
 4232  * loop level, ...
 4233  *
 4234  * \p func is the function in which the constant is defined. If this
 4235  * argument is not provided, the implicit Tiramisu function is used
 4236  * (i.e., the function created during Tiramisu initialization).
 4237  */
 // NOTE(review): several continuation lines of the two constructor
 // declarations below (the \p t parameter and the defaulted \p func
 // argument) were lost in the doc extraction — confirm against the
 // original core.h.
 4238  constant(std::string param_name, const tiramisu::expr &param_expr,
 4240  bool function_wide,
 4241  tiramisu::computation *with_computation,
 4242  int at_loop_level,
 4244 
 4245  constant(std::string param_name, const tiramisu::expr &param_expr,
 4248 
 4249  /**
 4250  * \brief Create a constant.
 4251  *
 4252  * \details
 4253  *
 4254  * Define a constant (scalar) at the entry of the function and make it visible to all
 4255  * the computations.
 4256  *
 4257  * \p param_name is the name of the constant.
 4258  * \p param_expr is the expression that defines the value of the constant.
 4259  *
 4260  */
 4261  constant(std::string param_name, const tiramisu::expr &param_expr);
 4262 
 4263  /**
 4264  * Get the computation with whom this constant is computed.
 4265  */
 // NOTE(review): the signature line of this inline getter was lost in
 // the doc extraction; it returns the compute_with_computation member.
 4267  {
 4268  return compute_with_computation;
 4269  }
 4270 
 4271  /**
 4272  * Dump the invariant on standard output (dump most of the fields of
 4273  * the invariant class).
 4274  * This is mainly useful for debugging.
 4275  * If \p exhaustive is set to true, all the fields of the invariant
 4276  * class are printed. This is useful to find potential initialization
 4277  * problems.
 4278  */
 4279  void dump(bool exhaustive) const;
 4280 
 4281  operator expr();
 4282 };
4283 
4284 
4285 /**
4286 * A class for code generation.
4287 */
4289 {
4290  friend function;
4291  friend computation;
4292  friend buffer;
4293  friend cuda_ast::generator;
4294 
4295 protected:
4296 
4297  /*
4298  * Compute the iterators map.
4299  * The iterators map is map between the original names of the iterators of a computation
4300  * and their transformed form after schedule (also after renaming).
4301  *
4302  * If in the original computation, we had
4303  *
4304  * {C[i0, i1]: ...}
4305  *
4306  * And if in the generated code, the iterators are called c0, c1, c2 and c3 and
4307  * the loops are tiled, then the map will be
4308  *
4309  * {<i0, c0*10+c2>, <i1, c1*10+c3>}.
4310  *
4311  **/
4312  static std::map<std::string, isl_ast_expr *>
4313  compute_iterators_map(tiramisu::computation *comp, isl_ast_build *build);
4314 
4315  /**
4316  * Get the computation associated with a node.
4317  */
4318  static std::vector<tiramisu::computation *>
4319  get_computation_by_node(tiramisu::function *fct, isl_ast_node *node);
4320 
4321  /**
4322  * Traverse the vector of computations \p comp_vec and return the computations
4323  * that have a domain that intersects with \p domain.
4324  */
4325  static std::vector<tiramisu::computation *> filter_computations_by_domain(std::vector<tiramisu::computation *> comp_vec,
4326  isl_union_set *node_domain);
4327 
4328  /**
4329  * Compute the accesses of the RHS of the computation
4330  * \p comp and store them in the accesses vector.
4331  *
4332  * If \p return_buffer_accesses is set to true, this function returns access functions to
4333  * buffers. Otherwise it returns access functions to computations.
4334  */
4335  static void get_rhs_accesses(const tiramisu::function *func, const tiramisu::computation *comp,
4336  std::vector<isl_map *> &accesses, bool return_buffer_accesses);
4337 
4338  /**
4339  * Analyze the \p access_expression and return a set of constraints
4340  * that correspond to the access pattern of the access_expression.
4341  *
4342  * access_dimension:
4343  * The dimension of the access. For example, the access
4344  * C0(i0, i1, i2) have three access dimensions: i0, i1 and i2.
4345  * access_expression:
4346  * The expression of the access.
4347  * This expression is parsed recursively (by calling get_constraint_for_access)
4348  * and is gradually used to update the constraint.
4349  * access_relation:
4350  * The access relation that represents the access.
4351  * cst:
4352  * The constraint that defines the access and that is being constructed.
4353  * Different calls to get_constraint_for_access modify this constraint
4354  * gradually until the final constraint is created. Only the final constraint
4355  * is added to the access_relation.
4356  * coeff:
4357  * The coefficient in which all the dimension coefficients of the constraint
4358  * are going to be multiplied. This coefficient is used to implement o_minus,
4359  * o_mul and o_sub.
4360  */
4361  static isl_constraint *get_constraint_for_access(int access_dimension,
4362  const tiramisu::expr &access_expression,
4363  isl_map *access_relation,
4364  isl_constraint *cst,
4365  int coeff,
4366  const tiramisu::function *fct);
4367 
4368 
4369  /**
4370  * Generate a Halide statement from an ISL ast node object in the ISL ast
4371  * tree.
4372  * Level represents the level of the node in the schedule. 0 means root.
 4373  * It takes as input:
4374  * - a function \p fct for which we are generating code,
4375  * - a \p node,
4376  * - \p level represents the current loop level being traversed (0 means the outer level.
 4377  * - \p is_a_child_block indicates whether the block that is going to be
 4378  * generated is a child block for another block. In such a case, allocate
 4379  * and let statements should not be generated. Allocate and let statements
4380  * should only be generated in non-child blocks so that their scope reaches
4381  * the whole block.
4382  */
4383  static Halide::Internal::Stmt halide_stmt_from_isl_node(const tiramisu::function &fct, isl_ast_node *node,
4384  int level,
4385  std::vector<std::pair<std::string, std::string>> &tagged_stmts,
4386  bool is_a_child_block);
4387 
4388  // TODO doc
4389  static Halide::Internal::Stmt make_halide_block(const Halide::Internal::Stmt &first,
4390  const Halide::Internal::Stmt &second);
4391 
4392  static Halide::Internal::Stmt make_buffer_alloc(buffer *b, const std::vector<Halide::Expr> &extents,
4393  Halide::Internal::Stmt &stmt);
4394  static Halide::Internal::Stmt make_buffer_free(buffer *b);
4395 
4396  /**
4397  * Create a Halide expression from a Tiramisu expression.
4398  */
4399  static Halide::Expr halide_expr_from_tiramisu_expr(const tiramisu::function *fct,
4400  std::vector<isl_ast_expr *> &index_expr,
4401  const tiramisu::expr &tiramisu_expr, tiramisu::computation *comp = nullptr);
4402 
4403  static tiramisu::expr replace_accesses(const tiramisu::function * func, std::vector<isl_ast_expr *> &index_expr,
4404  const tiramisu::expr &tiramisu_expr);
4405  static std::string get_buffer_name(const tiramisu::computation *);
4406  static tiramisu::expr comp_to_buffer(tiramisu::computation * comp, std::vector<isl_ast_expr *> &ie,
4407  const tiramisu::expr * expr = nullptr);
4408 
4409  /**
4410  * Linearize a multidimensional access to a Halide buffer.
4411  * Supposing that we have buf[N1][N2][N3], transform buf[i][j][k]
4412  * into buf[k + j*N3 + i*N3*N2].
4413  * Note that the first arg in index_expr is the buffer name. The other args
4414  * are the indices for each dimension of the buffer.
4415  */
4416  //@{
4417  static Halide::Expr linearize_access(int dims, const halide_dimension_t *shape, isl_ast_expr *index_expr);
4418  static Halide::Expr linearize_access(int dims, const halide_dimension_t *shape, std::vector<tiramisu::expr> index_expr);
4419  static Halide::Expr linearize_access(int dims, std::vector<Halide::Expr> &strides, std::vector<tiramisu::expr> index_expr);
4420  static Halide::Expr linearize_access(int dims, std::vector<Halide::Expr> &strides, isl_ast_expr *index_expr);
4421  static tiramisu::expr linearize_access(int dims, std::vector<tiramisu::expr> &strides, std::vector<tiramisu::expr> index_expr);
4422  static tiramisu::expr linearize_access(int dims, std::vector<tiramisu::expr> &strides, isl_ast_expr *index_expr);
4423  //@}
4424 
4425  /**
4426  * Retrieve the access function of the ISL AST leaf node (which represents a
4427  * computation). Store the access in computation->access.
4428  */
4429  static isl_ast_node *stmt_code_generator(isl_ast_node *node, isl_ast_build *build, void *user);
4430 
4431  /**
4432  * Traverse a tiramisu expression (\p exp) and extract the access relations
4433  * from the access operation passed in \p exp. The access relations are added
4434  * to the vector \p accesses.
4435  * The access relation is from the domain of the computation \p comp to the
4436  * domain of the computation accessed by the access operation.
4437  * If \p return_buffer_accesses = true, an access to a buffer is created
4438  * instead of an access to computations.
4439  */
4440  static void traverse_expr_and_extract_accesses(const tiramisu::function *fct,
4441  const tiramisu::computation *comp,
4442  const tiramisu::expr &exp,
4443  std::vector<isl_map *> &accesses,
4444  bool return_buffer_accesses);
4445 
4446  /**
4447  * Traverse a tiramisu expression (\p current_exp) until an expression with the specified name is found.
4448  * Replace that name with a new name. Replaces all occurrences.
4449  */
4450  static void _update_producer_expr_name(tiramisu::expr &current_exp, std::string name_to_replace,
4451  std::string replace_with);
4452 
4453 public:
4454 
4455  /**
4456  * Traverse a tiramisu expression (\p current_exp) until an expression with the specified name is found.
4457  * Replace that name with a new name. Replaces all occurrences.
4458  */
4459  static void update_producer_expr_name(tiramisu::computation *comp, std::string name_to_replace,
4460  std::string replace_with);
4461 };
4462 
4463 /**
4464  * A class containing utility functions.
4465  */
4466 class utility
4467 {
4468 public:
4469  /**
4470  * Traverse recursively the ISL AST tree
4471  *
4472  * \p node represents the root of the tree to be traversed.
4473  *
4474  * \p dim is the dimension of the loop from which the bounds have to be
4475  * extracted.
4476  *
4477  * \p upper is a boolean that should be set to true to extract
4478  * the upper bound and false to extract the lower bound.
4479  */
4480  static expr extract_bound_expression(isl_ast_node *ast, int dim, bool upper);
4481 
4482  /**
4483  * Return a tiramisu::expr representing the bound of
4484  * the dimension \p dim in \p set. If \p upper is true
4485  * then this function returns the upper bound otherwise
4486  * it returns the lower bound.
4487  *
4488  * For example, assuming that
4489  *
4490  * S = {S[i,j]: 0<=i<N and 0<=j<N and i<M}
4491  *
4492  * then
4493  *
4494  * get_upper_bound(S, 1)
4495  *
4496  * would return N-1, while
4497  *
4498  * get_upper_bound(S, 0)
4499  *
4500  * would return min(N-1,M-1)
4501  */
4502  static tiramisu::expr get_bound(isl_set *set, int dim, int upper);
4503 
4504  /**
4505  * Create a comma separated string that represents the list
4506  * of the parameters of \p set.
4507  *
4508  * For example, if the set is
4509  *
4510  * [N,M,K]->{S[i]}
4511  *
4512  * this function returns the string "N,M,K".
4513  */
4514  static std::string get_parameters_list(isl_set *set);
4515 
4516 };
4517 
4518 // TODO Jess: add doc comments
4519 
4520 class xfer_prop {
4521  friend class communicator;
4522 private:
4523 
4524  std::vector<tiramisu::xfer_attr> attrs;
4525 
4526  tiramisu::primitive_t dtype;
4527 
4528  int xfer_prop_id;
4529 
4530  xfer_prop();
4531 
4532 public:
4533 
4534  xfer_prop(tiramisu::primitive_t d_type, std::initializer_list<tiramisu::xfer_attr> attrs);
4535 
4536  xfer_prop(tiramisu::primitive_t d_type, std::initializer_list<tiramisu::xfer_attr> attrs,
4537  int xfer_prop_id);
4538 
4539  static std::set<int> xfer_prop_ids;
4540 
4541  static std::string attr_to_string(xfer_attr attr);
4542 
4543  tiramisu::primitive_t get_dtype() const;
4544 
4545  int get_xfer_prop_id() const;
4546 
4547  bool contains_attr(tiramisu::xfer_attr attr) const;
4548 
4549  bool contains_attrs(std::vector<tiramisu::xfer_attr> attrs) const;
4550 
4551  void add_attr(tiramisu::xfer_attr attr);
4552 
4553 };
4554 
4555 class communicator : public computation {
4556 private:
4557 
4558  std::vector<tiramisu::expr> dims;
4559 
4560 protected:
4561 
4563 
4564  communicator();
4565 
4566 public:
4567 
4568  communicator(std::string iteration_domain_str, tiramisu::expr rhs, bool schedule_this_computation,
4569  tiramisu::primitive_t data_type, tiramisu::function *fct);
4570 
4571  communicator(std::string iteration_domain_str, tiramisu::expr rhs,
4572  bool schedule_this_computation, tiramisu::primitive_t, tiramisu::xfer_prop prop,
4573  tiramisu::function *fct);
4574 
4575  xfer_prop get_xfer_props() const;
4576 
4577  tiramisu::expr get_num_elements() const;
4578 
4579  void add_dim(tiramisu::expr size);
4580 
4581  /**
4582  * Collapse a loop level.
4583  */
4584  std::vector<communicator *> collapse(int level, tiramisu::expr collapse_from_iter,
4585  tiramisu::expr collapse_until_iter, tiramisu::expr num_collapsed);
4586 
4587  /**
4588  * Collapse several consecutive loop levels (must collapse from innermost towards outermost).
4589  */
4590  void collapse_many(std::vector<collapse_group> collapse_each);
4591 
4592 };
4593 
4594 class send : public communicator {
4595 private:
4596 
4597  tiramisu::computation *producer = nullptr;
4598 
4599  recv *matching_recv = nullptr;
4600 
4601  tiramisu::expr msg_tag;
4602 
4603  tiramisu::expr src;
4604 
4605  tiramisu::expr dest;
4606 
4607  static int next_msg_tag;
4608 
4609 public:
4610  // TODO (Jess) is producer ever used?
4611  send(std::string iteration_domain_str, tiramisu::computation *producer, tiramisu::expr rhs,
4612  xfer_prop prop, bool schedule_this_computation, std::vector<expr> dims,
4613  tiramisu::function *fct);
4614 
4615  virtual bool is_send() const override;
4616 
4617  virtual void add_definitions(std::string iteration_domain_str, tiramisu::expr e,
4618  bool schedule_this_computation, tiramisu::primitive_t t,
4619  tiramisu::function *fct) override;
4620 
4621  tiramisu::computation *get_producer() const;
4622 
4623  tiramisu::recv *get_matching_recv() const;
4624 
4625  tiramisu::expr get_msg_tag() const;
4626 
4627  tiramisu::expr get_src() const;
4628 
4629  tiramisu::expr get_dest() const;
4630 
4631  void set_matching_recv(tiramisu::recv *matching_recv);
4632 
4633  void set_src(tiramisu::expr src);
4634 
4635  void set_dest(tiramisu::expr dest);
4636 
4637  void override_msg_tag(tiramisu::expr msg_tag);
4638 };
4639 
4640 class recv : public communicator {
4641 private:
4642 
4643  send *matching_send = nullptr;
4644 
4645  tiramisu::expr src;
4646 
4647  tiramisu::expr dest;
4648 
4649  tiramisu::expr msg_tag;
4650 
4651 public:
4652 
4653  recv(std::string iteration_domain_str, bool schedule_this, tiramisu::xfer_prop prop,
4654  tiramisu::function *fct);
4655 
4656  virtual bool is_recv() const override;
4657 
4658  virtual void add_definitions(std::string iteration_domain_str, tiramisu::expr e,
4659  bool schedule_this_computation, tiramisu::primitive_t t,
4660  tiramisu::function *fct) override;
4661 
4662  tiramisu::send *get_matching_send() const;
4663 
4664  tiramisu::expr get_msg_tag() const;
4665 
4666  tiramisu::expr get_src() const;
4667 
4668  tiramisu::expr get_dest() const;
4669 
4670  void set_matching_send(tiramisu::send *matching_send);
4671 
4672  void set_src(tiramisu::expr src);
4673 
4674  void set_dest(tiramisu::expr dest);
4675 
4676  void override_msg_tag(tiramisu::expr msg_tag);
4677 
4678 };
4679 
4680 class send_recv : public communicator {
4681 public:
4682 
4683  send_recv(std::string iteration_domain_str, tiramisu::computation *producer,
4684  tiramisu::expr rhs, xfer_prop prop, bool schedule_this_computation,
4685  std::vector<expr> dims, tiramisu::function *fct);
4686 
4687  virtual bool is_send_recv() const override;
4688 
4689 };
4690 
4691 class wait : public communicator {
4692 private:
4693 
4694  tiramisu::expr rhs;
4695 
4696 public:
4697 
4699 
4700  wait(std::string iteration_domain_str, tiramisu::expr rhs, xfer_prop prop, bool schedule_this,
4701  tiramisu::function *fct);
4702 
4703  virtual bool is_wait() const override;
4704 
4705  virtual void add_definitions(std::string iteration_domain_str, tiramisu::expr e,
4706  bool schedule_this_computation, tiramisu::primitive_t t,
4707  tiramisu::function *fct) override;
4708 
4709  std::vector<tiramisu::computation *> get_op_to_wait_on() const;
4710 
4711 };
4712 
4713 // Halide IR specific functions
4714 
4715 void halide_stmt_dump(Halide::Internal::Stmt s);
4716 
4717 Halide::Module lower_halide_pipeline(
4718  const std::string &pipeline_name,
4719  const Halide::Target &t,
4720  const std::vector<Halide::Argument> &args,
4721  const Halide::Internal::LoweredFunc::LinkageType linkage_type,
4722  Halide::Internal::Stmt s);
4723 
4724 int loop_level_into_dynamic_dimension(int level);
4725 int loop_level_into_static_dimension(int level);
4726 /**
4727  * TODO code cleaning:
4728  * - Go to the tutorials, add a small explanation about how Tiramisu should work in general.
4729  * - Add two pages explaining how one should use Tiramisu,
4730  *
4731  * - Have documentation on header files only,
4732  * - Order the functions in the class computations (get functions then update functions ordered in alphabetical order),
4733  * - Clean/document expr.h and type.h
4734  */
4735 
4736 }
4737 
4738 #endif
argument_t
Types of function arguments.
Definition: type.h:133
static function * get_implicit_function()
Return the implicit function created during Tiramisu initialization.
Definition: expr.h:80
std::vector< tiramisu::computation * > automatically_allocated
Keeps track of allocation computations created using allocate_and_map_buffer_automatically() to sched...
Definition: core.h:469
A class that represents computations.
Definition: core.h:1320
Halide::Module lower_halide_pipeline(const std::string &pipeline_name, const Halide::Target &t, const std::vector< Halide::Argument > &args, const Halide::Internal::LoweredFunc::LinkageType linkage_type, Halide::Internal::Stmt s)
std::map< std::string, tiramisu::computation * > computation_list
Definition: core.h:47
primitive_t
tiramisu data types.
Definition: type.h:27
tiramisu::primitive_t dtype
Data type.
Definition: expr.h:219
std::map< std::string, tiramisu::constant * > constant_list
Definition: core.h:48
int loop_level_into_static_dimension(int level)
void codegen(const std::vector< tiramisu::buffer * > &arguments, const std::string obj_filename, const bool gen_cuda_stmt=false)
Generate code.
static std::set< int > xfer_prop_ids
Definition: core.h:4539
Halide::Expr halide_expr_from_tiramisu_expr(const tiramisu::computation *comp, std::vector< isl_ast_expr * > &index_expr, const tiramisu::expr &tiramisu_expr)
Convert a Tiramisu expression into a Halide expression.
isl_ast_expr * wait_index_expr
An index expression just for the request buffer.
Definition: core.h:2284
expr substitute(std::vector< std::pair< var, expr >> substitutions) const
Returns a new expression where for every (var, sub) pair in substitutions, var in the original expres...
HalideCodegenOutput(const std::map< std::string, tiramisu::computation * > &computations, const std::map< std::string, tiramisu::constant * > &constants, const std::map< std::string, tiramisu::buffer * > &buffers)
Definition: core.h:51
Input(std::string name, std::vector< expr > sizes, primitive_t t)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: core.h:4128
void dump(bool exhaustive) const
Dump the object on standard output (dump most of the fields of the expression class).
Definition: expr.h:1021
view(std::string name, std::vector< var > iterator_variables, primitive_t t)
Constructor for a view.
Definition: core.h:4173
input(std::string name, std::vector< var > iterator_variables, primitive_t t)
Constructor for an input.
Definition: core.h:4093
void init(std::string name)
Initialize the Tiramisu compiler and set Tiramisu options to default values.
std::string library_call_name
If the computation represents a library call, this is the name of the function.
Definition: core.h:2279
A class that represents buffers.
Definition: core.h:1017
void fuse_after(tiramisu::var lev, computation &comp)
Fuse this computation with the computation passed as argument in the same loop.
Definition: core.h:3230
isl_map * isl_map_add_free_var(const std::string &free_var_name, isl_map *map, isl_ctx *ctx)
A class that represents a synchronization object.
Definition: expr.h:1688
bool use_low_level_scheduling_commands
A boolean set to true if low level scheduling was used in the program.
Definition: core.h:736
std::vector< var > iterators_from_size_expressions(std::vector< expr > sizes)
Definition: core.h:4112
bool _is_library_call
True if this computation represents a library call.
Definition: core.h:2274
computation(std::string name, std::vector< var > iterator_variables, primitive_t t)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: core.h:2676
int loop_level_into_dynamic_dimension(int level)
std::tuple< int, tiramisu::expr, tiramisu::expr, tiramisu::expr > collapse_group
Loop level, loop start, loop end, number of elements to collapse.
Definition: core.h:95
A class to represent tiramisu expressions.
Definition: expr.h:150
A class to represent functions in Tiramisu.
Definition: core.h:131
xfer_prop prop
Definition: core.h:4562
tiramisu::send_recv * sr
Definition: core.h:89
std::unordered_map< tiramisu::computation *, std::unordered_map< tiramisu::computation *, int > > sched_graph
Stores all high level scheduling instructions between computations; i.e.
Definition: core.h:659
tiramisu::computation * get_computation_with_whom_this_is_computed()
Get the computation with whom this constant is computed.
Definition: core.h:4266
std::unordered_map< tiramisu::computation *, std::unordered_map< tiramisu::computation *, int > > sched_graph_reversed
Same as sched_graph, except in reverse order (from after to before).
Definition: core.h:665
std::string generate_new_computation_name()
std::map< std::string, tiramisu::buffer * > output_buffers
Definition: core.h:49
A class that represents loop invariants.
Definition: core.h:4187
input(std::vector< var > iterator_variables, primitive_t t)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: core.h:4103
op_t
Types of tiramisu operators.
Definition: type.h:53
tiramisu::recv * r
Definition: core.h:88
computation(std::vector< var > iterator_variables, primitive_t t)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: core.h:2686
xfer_attr
Definition: core.h:73
const std::string & get_name() const
Get the name of the ID or the variable represented by this expressions.
Definition: expr.h:702
void split_string(std::string str, std::string delimiter, std::vector< std::string > &vector)
int get_loop_level_number_from_dimension_name(std::string dim_name)
Search the time-space domain (the range of the schedule) and return the loop level number that corres...
Definition: core.h:3281
HalideCodegenOutput halide_pipeline_to_tiramisu_function(Halide::Internal::Stmt s, const std::vector< Halide::Internal::Function > &outputs, const std::map< std::string, Halide::Internal::Function > &env, const std::map< std::string, std::vector< int32_t >> &output_buffers_size, tiramisu::function *func)
Definition: core.h:27
std::unordered_set< tiramisu::computation * > starting_computations
The set of all computations that have no computation scheduled before them.
Definition: core.h:729
A class that represents constant variable references.
Definition: expr.h:1704
tiramisu::expr operator()(Args...args)
Access operator: C0(i,j) represents an access to the element (i,j) of the computation C0...
Definition: core.h:4005
A class containing utility functions.
Definition: core.h:4466
computation * get_computation_annotated_in_a_node(isl_ast_node *node)
void halide_stmt_dump(Halide::Internal::Stmt s)
static const var root
root_dimension is a number used to specify the dimension level known as root.
Definition: core.h:3990
tiramisu::send * s
Definition: core.h:87
A class for code generation.
Definition: core.h:4288