Tiramisu Compiler
core.h
Go to the documentation of this file.
1 #ifndef _H_TIRAMISU_CORE_
2 #define _H_TIRAMISU_CORE_
3 
#include <isl/set.h>
#include <isl/map.h>
#include <isl/union_map.h>
#include <isl/union_set.h>
#include <isl/ast_build.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>
#include <isl/space.h>
#include <isl/constraint.h>

#include <map>
#include <memory>
#include <sstream>
#include <stdint.h>
#include <string.h>
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <Halide.h>
#include <tiramisu/debug.h>
#include <tiramisu/expr.h>
#include <tiramisu/type.h>
#include "cuda_ast.h"
26 
27 namespace tiramisu
28 {
29 class view;
30 class input;
31 class function;
32 class computation;
33 class buffer;
34 class constant;
35 class generator;
36 class computation_tester;
37 class send;
38 class recv;
39 class send_recv;
40 class wait;
41 class sync;
42 class xfer_prop;
43 
44 
46 {
47  std::map<std::string, tiramisu::computation *> computation_list;
48  std::map<std::string, tiramisu::constant *> constant_list;
49  std::map<std::string, tiramisu::buffer *> output_buffers;
50 
51  HalideCodegenOutput(const std::map<std::string, tiramisu::computation *> &computations,
52  const std::map<std::string, tiramisu::constant *> &constants,
53  const std::map<std::string, tiramisu::buffer *> &buffers)
54  : computation_list(computations), constant_list(constants), output_buffers(buffers) {}
55 };
56 
58  Halide::Internal::Stmt s,
59  const std::vector<Halide::Internal::Function> &outputs,
60  const std::map<std::string, Halide::Internal::Function> &env,
61  const std::map<std::string, std::vector<int32_t>> &output_buffers_size,
62  tiramisu::function *func);
63 
65 
66 std::string generate_new_computation_name();
67 
68 isl_map *isl_map_add_free_var(const std::string &free_var_name, isl_map *map, isl_ctx *ctx);
69 void split_string(std::string str, std::string delimiter, std::vector<std::string> &vector);
72 
73 enum xfer_attr {
78  MPI,
84 };
85 
86 struct xfer {
90 };
91 
92 /**
93  * Loop level, loop start, loop end, number of elements to collapse
94  */
95 typedef std::tuple<int, tiramisu::expr, tiramisu::expr, tiramisu::expr> collapse_group;
96 
97 
98 //*******************************************************
99 
100 /**
101  * Initialize the Tiramisu compiler and set Tiramisu options to default
102  * values.
103  * \p fct_name is the name of the function to be generated by the Tiramisu compiler.
104  * This is the name that the user will call to execute the generated code.
105  */
106 void init(std::string name);
107 
108 /**
109  * \overload
110  */
111 void init();
112 
113 /**
114  * \brief Generate code.
115  *
116  * \details
117  *
118  * This function generates the declared function and computations in an object file.
119  * \p obj_filename is the relative path of the object file to be generated.
120  * If \p gen_cuda_stmt is set to true, CUDA code is generated instead of code
121  * targeting CPU (i.e., instead of generating Halide IR then LLVM IR).
122  */
123 void codegen(const std::vector<tiramisu::buffer *> &arguments, const std::string obj_filename, const bool gen_cuda_stmt = false);
124 
125 //*******************************************************
126 
127 /**
128  * A class to represent functions in Tiramisu. A function in Tiramisu is composed of
129  * a set of computations (tiramisu::computation).
130  */
131 class function
132 {
133  // Friend classes. They can access the private members of the "function" class.
134  friend buffer;
135  friend computation;
136  friend constant;
137  friend generator;
138  friend tiramisu::wait;
139  friend cuda_ast::generator;
140 
141 private:
142  /**
143  * The name of the function.
144  * Function names should not start with _ (an underscore).
145  * Names starting with _ are reserved names.
146  */
147  std::string name;
148 
149  /**
150  * True if this requires a call to MPI_Comm_rank or something similar.
151  */
152  bool _needs_rank_call;
153 
154  /**
155  * Offset the rank by this amount.
156  */
157  int rank_offset = 0;
158 
159  /**
160  * Function arguments. These are the buffers or scalars that are
161  * passed to the function.
162  */
163  std::vector<tiramisu::buffer *> function_arguments;
164 
165  /**
166  * A vector representing the invariants of the function (symbolic
167  * constants or variables that are invariant to the function i.e.
168  * do not change their value during the execution of the function).
169  */
170  std::vector<tiramisu::constant> invariants;
171 
172  /**
173  * An ISL context associate with the function.
174  */
175  isl_ctx *ctx;
176 
177  /**
178  * An ISL AST representation of the function.
179  * The ISL AST is generated by calling gen_isl_ast().
180  */
181  isl_ast_node *ast;
182 
183  /**
184  * A vector representing the parallel dimensions around
185  * the computations of the function.
186  * A parallel dimension is identified using the pair
187  * <computation_name, level>, for example the pair
188  * <S0, 0> indicates that the loop with level 0
189  * (i.e. the outermost loop) around the computation S0
190  * should be parallelized.
191  */
192  std::vector<std::pair<std::string, int>> parallel_dimensions;
193 
194  /**
195  * A vector representing the vectorized dimensions around
196  * the computations of the function.
197  * A vector dimension is identified using the tuple
198  * <computation_name, level, length>, for example the tuple
199  * <S0, 0, 4> indicates that the loop with level 0
200  * (i.e. the outermost loop) around the computation S0
201  * should be vectorized with a vector length 4.
202  */
203  std::vector<std::tuple<std::string, int, int>> vector_dimensions;
204 
205  /**
206  * A vector representing the distributed dimensions around
207  * the computations of the function.
208  * A distributed dimension is identified using the pair
209  * <computation_name, level>, for example the pair
210  * <S0, 0> indicates that the loop with level 0
211  * (i.e. the outermost loop) around the computation S0
212  * should be distributed.
213  */
214  std::vector<std::pair<std::string, int>> distributed_dimensions;
215 
216  /**
217  * A vector representing the GPU block dimensions around
218  * the computations of the function.
219  * GPU dimensions dimensions are dimensions that should be mapped
220  * to parallel GPU dimensions.
221  * GPU dimensions are identified using the tuple
222  * <computation_name, level0, level1, level2>, for example the tuple
223  * <S0, 0, 1, 2> indicates that the loops with levels 0, 1 and 2
224  * (i.e. the three outermost loops) around the computation S0
225  * should be mapped to GPU dimensions.
226  * Level1 must be the level following level0, i.e.
227  * level1 == level0 + 1
228  * and level2 must be the level following level1
229  */
230  std::vector<std::pair<std::string, std::tuple<int, int, int>>> gpu_block_dimensions;
231 
232  /**
233  * Similar to gpu_dimensions but used to store information about GPU thread dimensions.
234  * i.e., dimensions that should mapped to threads (in CUDA terminology).
235  */
236  std::vector<std::pair<std::string, std::tuple<int, int, int>>> gpu_thread_dimensions;
237 
238  /**
239  * A vector representing the dimensions that should be unrolled
240  * around the computations of the function.
241  * Unrolled dimensions are identified using the tuple
242  * <computation_name, level0>, for example the tuple
243  * <S0, 1> indicates that the loops with level 1
244  * around the computation S0 should be unrolled.
245  */
246  std::vector<std::pair<std::string, int>> unroll_dimensions;
247 
248  /**
249  * Body of the function (a vector of computations).
250  * The order of the computations in the vector does not have any
251  * effect on the actual order of execution of the computations.
252  * The order of execution of computations is specified through the
253  * schedule.
254  */
255  std::vector<computation *> body;
256 
257  /**
258  * A Halide statement that represents the whole function.
259  * This value stored in halide_stmt is generated by the code generator
260  * and is empty before calling the code generator.
261  */
262  Halide::Internal::Stmt halide_stmt;
263 
264  /**
265  * A map representing the buffers of the function. Some of these
266  * buffers are passed to the function as arguments and some are
267  * declared and allocated within the function itself.
268  */
269  std::map<std::string, tiramisu::buffer *> buffers_list;
270 
271  /**
272  * The context set of the function, i.e. a set representing the
273  * constraints over the parameters.
274  * The parameters of a function are the function invariants (constants).
275  */
276  isl_set *context_set;
277 
278  /**
279  * The names of the iterators.
280  */
281  std::vector<std::string> iterator_names;
282 
283  /**
284  * Map from loop iterators to CUDA kernels.
285  */
286  std::unordered_map<isl_ast_node*, cuda_ast::kernel_ptr> iterator_to_kernel_map;
287 
288  std::shared_ptr<cuda_ast::compiler> nvcc_compiler;
289 
290  /**
291  * Tag the dimension \p dim of the computation \p computation_name to
292  * be parallelized.
293  * The dimension 0 represents the outermost loop level (it
294  * corresponds to the leftmost dimension in the iteration space).
295  */
296  void add_parallel_dimension(std::string computation_name, int vec_dim);
297 
298  /**
299  * Tag the dimension \p dim of the computation \p computation_name to
300  * be vectorized. \p len is the vector length.
301  * The dimension 0 represents the outermost loop level (it
302  * corresponds to the leftmost dimension in the iteration space).
303  */
304  void add_vector_dimension(std::string computation_name, int vec_dim, int len);
305 
306  /**
307  * Tag the dimension \p dim of the computation \p computation_name to
308  * be distributed.
309  * The dimension 0 represents the outermost loop level (it
310  * corresponds to the leftmost dimension in the iteration space).
311  */
312  void add_distributed_dimension(std::string computation_name, int dim);
313 
314  /**
315  * Tag the loop level \p L of the computation
316  * \p computation_name to be unrolled.
317  * The dimension 0 represents the outermost loop level (it
318  * corresponds to the leftmost dimension in the iteration space).
319  */
320  void add_unroll_dimension(std::string stmt_name, int L);
321 
322  /**
323  * Get live in/out computations in the function.
324  */
325  // @{
326  std::vector<tiramisu::computation *> get_live_in_computations();
327  std::vector<tiramisu::computation *> get_live_out_computations();
328  // @}
329 
330  /**
331  * Recursive function to perform the DFS step of dump_sched_graph.
332  */
333  void dump_sched_graph_dfs(tiramisu::computation *,
334  std::unordered_set<tiramisu::computation *> &);
335 
336  /**
337  * Recursive function to perform the DFS step of is_sched_graph_tree.
338  */
339  bool is_sched_graph_tree_dfs(tiramisu::computation *,
340  std::unordered_set<tiramisu::computation *> &);
341  /**
342  * This functions iterates over the iteration domain of the computations
343  * of the function and computes the maximum dimension among the dimensions
344  * of these iteration domains.
345  */
346  int get_max_iteration_domains_dim() const;
347 
348 
349  /**
350  * This functions iterates over the schedules of the function (the schedule
351  * of each computation in the function) and computes the maximal dimension
352  * among the dimensions of the ranges of all the schedules.
353  */
354  int get_max_schedules_range_dim() const;
355 
356  /**
357  * This function first computes the identity schedules,
358  * then it computes the maximal dimension among the dimensions
359  * of the ranges of all the identity schedules.
360  */
361  int get_max_identity_schedules_range_dim() const;
362 
363  /**
364  * Get the trimmed time-processor domain of the function.
365  * The first dimension of the time-processor domain is used
366  * to indicate duplicates of the computation. Computations that
367  * do not have any duplicate have 0 in that dimension, whereas
368  * computations that have duplicates (i.e., are recomputed) have
369  * a number in that dimension to represent each duplicate.
370  * The trimmed time-processor domain is the time-processor domain
371  * without the dimension that represents the duplicates. We simply
372  * take the time-processor domain and remove the first dimension
373  * used to represent the duplicates.
374  */
375  isl_union_set *get_trimmed_time_processor_domain() const;
376 
377  /**
378  * This function iterates over the computations of the function.
379  * It modifies the identity schedule of each computation in order to
380  * make all the identity schedules have the same number of dimensions
381  * in their ranges.
382  * This is done by adding dimensions equal to 0 to the range of each
383  * identity schedule that does not have enough dimensions.
384  */
385  isl_union_map *get_aligned_identity_schedules() const;
386 
387  /**
388  * A pass to rename computations.
389  * Computation that are defined multiple times need to be renamed, because
390  * those computations in general have different expressions and the code
391  * generator expects that computations that have the same name always have
392  * the same expression and access relation. So we should rename them to avoid
393  * any ambiguity for the code generator.
394  *
395  */
396  void rename_computations();
397 
398 
399 protected:
400 
401  /**
402  * Add a buffer to the function.
403  * The buffers of the function are either:
404  * - buffers passed to the function as arguments, or
405  * - buffers that are declared and allocated within the function
406  * itself.
407  * The first element of the pair is the name of the buffer (it is
408  * used as a key), the second element of the pair is a pointer
409  * to the buffer.
410  */
411  void add_buffer(std::pair<std::string, tiramisu::buffer *> buf);
412 
413  /**
414  * Add a computation to the function. The order in which
415  * computations are added to the function is not important.
416  * The order of execution is specified using the schedule.
417  * This doesn't allow computations with duplicate names.
418  */
419  void add_computation(computation *cpt);
420 
421  /**
422  * Tag the dimensions \p dim0, \p dim1 and \p dim2 of the computation
423  * \p computation_name to be mapped to GPU blocks.
424  * The dimension 0 represents the outermost loop level (it
425  * corresponds to the leftmost dimension in the iteration space).
426  *
427  * If the user does not want to tag \p dim1 or \p dim2, he can leave
428  * their values to default value (i.e., -1). They will not be tagged.
429  *
430  * For example
431  *
432  * add_gpu_block_dimensions("S0", 1, 2);
433  *
434  * Will tag the dimensions 1 and 2 to be transformed to GPU blocks.
435  */
436  void add_gpu_block_dimensions(std::string stmt_name, int dim0, int dim1 = -1, int dim2 = -1);
437 
438  /**
439  * Tag the dimensions \p dim0, \p dim1 and \p dim2 of the computation
440  * \p computation_name to be mapped to GPU threads.
441  * The dimension 0 represents the outermost loop level (it
442  * corresponds to the leftmost dimension in the iteration space).
443  *
444  * If the user does not want to tag \p dim1 or \p dim2, he can leave
445  * their values to default value (i.e., -1). They will not be tagged.
446  *
447  * For example
448  *
449  * add_gpu_block_dimensions("S0", 1, -1, -1);
450  *
451  * Will tag the dimension 1 to be transformed to GPU threads.
452  */
453  void add_gpu_thread_dimensions(std::string stmt_name, int dim0, int dim1 = -1, int dim2 = -1);
454 
455  /**
456  * Add an invariant to the function.
457  */
458  void add_invariant(tiramisu::constant param);
459 
460  /**
461  * Add an iterator to the function.
462  */
463  void add_iterator_name(const std::string &it_name);
464 
465  /**
466  * Keeps track of allocation computations created using
467  * allocate_and_map_buffer_automatically() to schedule them during gen_ordering_schedules.
468  */
469  std::vector<tiramisu::computation *> automatically_allocated;
470 
471  /**
472  * Compute the graph of dependences between the computations of
473  * the function.
474  *
475  * Example
476  *
477  * C[0] = 0
478  * D[1] = C[0]
479  * D[2] = C[0]
480  * {C[0] -> D[1]; C[0]->D[2]}
481  */
482  isl_union_map *compute_dep_graph();
483 
484  /**
485  * Get the arguments of the function.
486  */
487  const std::vector<tiramisu::buffer *> &get_arguments() const;
488 
489  /**
490  * Return a map that represents the buffers of the function.
491  * The buffers of the function are buffers that are either passed
492  * to the function as arguments or are buffers that are declared
493  * and allocated within the function itself.
494  * The names of the buffers are used as a key for the map.
495  */
496  const std::map<std::string, tiramisu::buffer *> &get_buffers() const;
497 
498  /**
499  * Return a vector of the computations of the function.
500  * The order of the computations in the vector does not have any
501  * effect on the actual order of execution of the computations.
502  * The order of execution of computations is specified through the
503  * schedule.
504  */
505  const std::vector<computation *> &get_computations() const;
506 
507  /**
508  * Return the computation of the function that has
509  * the name \p str.
510  */
511  std::vector<computation *> get_computation_by_name(std::string str) const;
512 
513  /**
514  * Return a string representing the name of the GPU block iterator at
515  * dimension \p lev0.
516  * This function only returns a non-empty string if the
517  * computation \p comp is mapped to a GPU block at the dimension \p lev0.
518  */
519  std::string get_gpu_block_iterator(const std::string &comp, int lev0) const;
520 
521  /**
522  * Return a string representing the name of the GPU thread iterator at
523  * dimension \p lev0.
524  * This function only returns a non-empty string if the
525  * computation \p comp is mapped to a GPU thread at the dimension \p lev0.
526  */
527  std::string get_gpu_thread_iterator(const std::string &comp, int lev0) const;
528 
529  /**
530  * Return the isl_ctx associated with this function.
531  * This is an ISL specific object required when calling certain
532  * ISL functions. It does not represent the set of parameters
533  * of the function (which should be retrieved by calling
534  * get_program_context()).
535  */
536  isl_ctx *get_isl_ctx() const;
537 
538  /**
539  * Return the Halide statement that represents the whole
540  * function.
541  * The Halide statement is generated by the code generator.
542  * This function should not be called before calling the code
543  * generator.
544  */
545  Halide::Internal::Stmt get_halide_stmt() const;
546 
547  /**
548  * Return a vector representing the invariants of the function
549  * (symbolic constants or variables that are invariant to the
550  * function i.e. do not change their value during the execution
551  * of the function).
552  *
553  * get_invariant_names() returns the names of the variants.
554  */
555  //@{
556  const std::vector<tiramisu::constant> &get_invariants() const;
557  const std::vector<std::string> get_invariant_names() const;
558  //@}
559 
560  /**
561  * Return an ISL AST that represents this function.
562  * This function itself does not generate the ISL AST, it just
563  * returns it if it already exists.
564  * The function gen_isl_ast() should be called before calling
565  * this function.
566  */
567  isl_ast_node *get_isl_ast() const;
568 
569  /**
570  * Return the union of all the iteration domains
571  * of the computations of the function.
572  */
573  isl_union_set *get_iteration_domain() const;
574 
575  /**
576  * Get the iterator names of the function.
577  */
578  const std::vector<std::string> &get_iterator_names() const;
579 
580  /**
581  * Get the name of the function.
582  */
583  const std::string &get_name() const;
584 
585  /**
586  * Return a set that represents the parameters of the function
587  * (an ISL set that represents the parameters and constraints over
588  * the parameters of the functions, a parameter is an invariant
589  * of the function). This set is also known as the context of
590  * the program.
591  * An example of a context set is the following:
592  * "[N,M]->{: M>0 and N>0}"
593  * This context set indicates that the two parameters N and M
594  * are strictly positive.
595  */
596  isl_set *get_program_context() const;
597 
598  /**
599  * Return the union of all the schedules
600  * of the computations of the function.
601  */
602  isl_union_map *get_schedule() const;
603 
604  /**
605  * Return the union of all the trimmed schedules of the
606  * function.
607  * A trimmed schedule is the schedule without the duplication
608  * dimension (the schedule dimension used to indicate duplicate
609  * computations).
610  */
611  isl_union_map *get_trimmed_schedule() const;
612 
613  /**
614  * Return the union of time-processor domains of each
615  * computation in the function.
616  * In the time-processor representation, the logical time of
617  * execution and the processor where the computation will be
618  * executed are both specified.
619  */
620  isl_union_set *get_time_processor_domain() const;
621 
622  /**
623  * If the computation \p comp is vectorized, return its vector length
624  * at the loop level \p lev.
625  */
626  int get_vector_length(const std::string &comp, int lev) const;
627 
628  /**
629  * Return true if the usage of high level scheduling comments is valid; i.e. if
630  * the scheduling relations formed using before, after, compute_at, etc.. form a tree.
631  *
632  * More specifically, it verifies that:
633  * - There should be exactly one computation with no computation scheduled before it.
634  * - Each other computation should have exactly one computation scheduled before it.
635  */
636  bool is_sched_graph_tree();
637 
638  /**
639  * Modify the schedules of the computations of this function to reflect
640  * the order specified using the high level scheduling commands.
641  *
642  * Commands like .after() and .before() do not directly modify the schedules
643  * but rather modify the sched_graph graph.
644  */
645  void gen_ordering_schedules();
646 
647  /**
648  * Stores all high level scheduling instructions between computations; i.e. if a user calls
649  * for example c2.after(c1, L), sched_graph[&c1] would contain the key &c2, and
650  * sched_graph[&c1][&c2] = L.
651  * At the end of scheduling, the graph should respect the following rules:
652  * - There should be exactly one computation with no computation scheduled before it.
653  * - Each other computation should have exactly one computation scheduled before it.
654  * In other words, the graph should be a valid tree.
655  * Does not include allocation computations created using
656  * allocate_and_map_buffer_automatically().
657  */
658  std::unordered_map<tiramisu::computation *,
659  std::unordered_map<tiramisu::computation *, int>> sched_graph;
660 
661  /**
662  * Same as sched_graph, except in reverse order (from after to before).
663  */
664  std::unordered_map<tiramisu::computation *,
665  std::unordered_map<tiramisu::computation *, int>> sched_graph_reversed;
666 
667  /**
668  * Set the iterator names of the function.
669  * This function overrides any previously set iterator names.
670  */
671  void set_iterator_names(const std::vector<std::string> &it_names);
672 
673  /**
674  * Return true if the computation \p comp should be mapped to GPU block
675  * at the loop levels \p lev0.
676  */
677  bool should_map_to_gpu_block(const std::string &comp, int lev0) const;
678 
679  /**
680  * Return true if the computation \p comp should be mapped to GPU thread
681  * at the loop levels \p lev0.
682  */
683  bool should_map_to_gpu_thread(const std::string &comp, int lev0) const;
684 
685  /**
686  * Return true if the computation \p comp should be parallelized
687  * at the loop level \p lev.
688  */
689  bool should_parallelize(const std::string &comp, int lev) const;
690 
691  /**
692  * Return true if the computation \p comp should be unrolled
693  * at the loop level \p lev.
694  */
695  bool should_unroll(const std::string &comp, int lev) const;
696 
697  /**
698  * Return true if the computation \p comp should be vectorized
699  * at the loop level \p lev.
700  */
701  bool should_vectorize(const std::string &comp, int lev) const;
702 
703  /**
704  * Return true if the computation \p comp should be distributed
705  * at the loop level \p lev.
706  */
707  bool should_distribute(const std::string &comp, int lev) const;
708 
709  /**
710  * This computation requires a call to the MPI_Comm_rank function.
711  */
712  bool needs_rank_call() const;
713 
714  /**
715  * Lift certain computations for distributed execution to function calls.
716  */
717  void lift_mpi_comp(tiramisu::computation *comp);
718 
719  /**
720  * Lift certain computations for distributed execution to function calls.
721  */
722  void lift_dist_comps();
723 
724  /**
725  * The set of all computations that have no computation scheduled before them.
726  * Does not include allocation computations created using
727  * allocate_and_map_buffer_automatically().
728  */
729  std::unordered_set<tiramisu::computation *> starting_computations;
730 
731  /**
732  * A boolean set to true if low level scheduling was used in the program.
733  * If it is used, then high level scheduling commands such as .before(),
734  * .after(), ...
735  */
737 
738 public:
739 
740  /**
741  * \brief Construct a function called \p name.
742  * \details Function names should not start with _ (an underscore).
743  * Names starting with _ are reserved names.
744  */
745  function(std::string name);
746 
747  /**
748  * \brief Add a set of constraints to the context of the program.
749  *
750  * \details This command is useful for providing constraints over the constants
751  * used within a tiramisu function.
752  * The context of a function is represented as an ISL set that represents
753  * constraints over the parameters of the function (a parameter of a
754  * function is a constant used in that function).
755  *
756  * For example, if the constants M and N are known to be positive, it is beneficial
757  * to provide such an information to the Tiramisu compiler as follows:
758  * f.add_context_constraints("[N,M]->{: M>0 and N>0}");
759  * This context set indicates that the two parameters N and M
760  * are strictly positive in the function f.
761  *
762  * \p new_context should have the same space as the context set.
763  * This call intersects the set \p new_context
764  * (input) with the context of the function.
765  */
766  void add_context_constraints(const std::string &new_context);
767 
768  /**
769  * \brief Align the schedules of all the computations of this
770  * function.
771  *
772  * This method applies to the schedule of each computation
773  * in the function. It makes the dimensions of the ranges of
774  * all the schedules equal. This is done by adding dimensions
775  * equal to 0 to the range of each schedule.
776  * This function is called automatically when gen_isl_ast()
777  * or gen_time_processor_domain() are called.
778  */
779  void align_schedules();
780 
781  /**
782  * \brief For each computation, allocate a buffer and map the computation
783  * to that buffer.
784  *
785  * \details For each computation in the function:
786  * - Allocate a buffer where the size of the buffer is derived automatically.
787  * Assuming the name of the computation is C, the name of the generated buffer
788  * is _C_buffer.
789  * - Map the computation to the allocated buffer (one-to-one mapping).
790  * For more details about one-to-one mapping, see computation::store_in.
791  */
792  void allocate_and_map_buffers_automatically();
793 
794  /**
795  * \brief Compute the bounds of each computation.
796  *
797  * \details Computing the bounds of each computation means computing
798  * the constraints over the iteration domains of each computation in
799  * the function.
800  *
801  * In order to deduce bounds, Tiramisu first identifies the final consumers
802  * in the function (i.e., computations that do not have any consumer).
803  * Then, it propagates the bounds over the final consumers to their producers.
804  * The bounds of each consumer are used to deduce the bounds over its producer.
805  *
806  * To take benefit of bound inference, the user can declare computations
807  * without providing constraints on their iteration domains. For example
808  * the user can declare the following computations (the left side is the
809  * iteration domain, while the right side is the expression attached to
810  * each computation)
811  *
812  * \code
813  * {A[i] } : 0
814  * {B[i] } : 0
815  * {C[i] } : A[i] + B[i]
816  * {D[i]: 0<=i<N} : 2*C[i]
817  * \endcode
818  *
819  *
820  * The user needs only to provide constraints over the domains of the
821  * last computations (last consumers), and Tiramisu will propagate
822  * these constraints to all the chain of computations that produce for
823  * those consumers.
824  * In the previous example, constraints over the iteration domain were
825  * only provided for the last consumer "D[i]" and no constraints were
826  * provided for the other computations. Bound inference would deduce
827  * the constraints for the computations A[i], B[i] and C[i].
828  *
829  * Note that bound inference is not possible if you have multiple definitions
830  * of the same computation. For example, if you have multiple definitions
831  * of the same computations, in such a case the user should provide
832  * constraints of the iteration domain of the computation. Example:
833  *
834  * \code
835  * {A[i] } : 0
836  * {C[i]: i=0 } : 0
837  * {C[i]: 1<=i<N} : C[i-1] + A[i]
838  * {D[i]: 0<=i<N} : 2*C[i]
839  * \endcode
840  *
841  * In this case, constraints over the computations defining C[i] should be
842  * provided.
843  */
844  void compute_bounds();
845 
846  /**
847  * \brief Dump the function on standard output (dump most of the fields of
848  * tiramisu::function).
849  * \details This is mainly useful for debugging.
850  * If \p exhaustive is set to true, all the fields of the function
851  * class are printed.
852  */
853  void dump(bool exhaustive) const;
854 
855  /**
856  * \brief Dump the graph of dependences between computations.
857  * \details The graph of dependences is a union of maps (relations) from producers
858  * to consumers.
859  */
860  void dump_dep_graph();
861 
862  /**
863  * \brief Dump a Halide stmt that represents the function.
864  * \details tiramisu::function::gen_halide_stmt should be called before calling
865  * this function.
866  */
867  void dump_halide_stmt() const;
868 
869  /**
870  * \brief Dump the iteration domain of the function.
871  * \details This is mainly useful for debugging.
872  */
873  void dump_iteration_domain() const;
874 
875  /**
876  * \brief Dump the schedules of the computations of the function.
877  * \details This function is mainly useful for debugging.
878  * See tiramisu::computations::set_low_level_schedule for details about the
879  * schedule.
880  */
881  void dump_schedule() const;
882 
883  /**
884  * \brief Dumps the graph of scheduling relations set by the higher level scheduling
885  * functions (e.g. after, before, compute_at...).
886  * \details This is mainly useful for debugging.
887  * This function can be called at any point during scheduling.
888  */
889  void dump_sched_graph();
890 
891  /**
892  * \brief Dump (on stdout) the time processor domain of the function.
893  * \details The time-processor domain should be generated using
894  * tiramisu::function::gen_time_processor_domain before calling this
895  * function.
896  * This is mainly useful for debugging.
897  */
898  void dump_time_processor_domain() const;
899 
900  /**
901  * \brief Dump (on stdout) the trimmed time processor domain of the function.
902  * \details The time-processor domain should be generated using
903  * tiramisu::function::gen_time_processor_domain before calling this function.
904  * This is mainly useful for debugging.
905  * The difference between the time-processor domain and the trimmed
906  * time-processor domain is that the trimmed one does not have the
907  * duplicate dimension. We remove it before printing.
908  * The trimmed time-processor domain is the domain used for code
909  * generation.
910  */
911  void dump_trimmed_time_processor_domain() const;
912 
913  /**
914  * \brief Generate C code and print it on stdout.
915  * \details Currently C code generation is very basic and does not
916  * support many features compared to the Halide code generator.
917  * Use this for debugging only.
918  */
919  void gen_c_code() const;
920 
921  /**
922  * \brief Generate an object file that contains the compiled function.
923  * \details This function relies on Halide to generate the object file.
924  *
925  * \p obj_file_name indicates the name of the generated file.
926  *
927  * \p os indicates the target operating system (Halide::Target::OS).
928  *
929  * \p arch indicates the architecture of the target (the instruction set).
930  *
931  * \p bits indicates the bit-width of the target machine.
932  * It must be 0 for unknown, or 32 or 64.
933  * For a full list of supported values for \p os and \p arch please
934  * check the documentation of Halide::Target
935  * (http://halide-lang.org/docs/struct_halide_1_1_target.html).
936  * If the machine parameters are not supplied, Halide detects
937  * the parameters of the host machine automatically.
938 
939  */
940  void gen_halide_obj(const std::string &obj_file_name, Halide::Target::OS os,
941  Halide::Target::Arch arch, int bits) const;
942 
943  /**
944  * \overload
945  */
946  void gen_halide_obj(const std::string &obj_file_name) const;
947 
948  /**
949  * Generate a Halide stmt that represents the function.
950  */
951  void gen_halide_stmt();
952 
 /**
  * \brief Generate CUDA statements for the function.
  * \details NOTE(review): undocumented in the original; inferred from the
  * name and this header's cuda_ast.h dependency — confirm against cuda_ast.h.
  */
953  void gen_cuda_stmt();
954 
955  /**
956  * Generate an isl AST that represents the function.
957  */
958  void gen_isl_ast();
959 
960  /**
961  * Generate the time-space domain of the function.
962  *
963  * In this representation, the logical time of execution and the
964  * processor where the computation will be executed are both
965  * specified.
966  */
967  void gen_time_space_domain();
968 
969  /**
970  * Set the arguments of the function.
971  * The arguments of the function are provided as a vector of
972  * pointers to buffers. Each buffer represents an argument
973  * to the function.
974  * During code generation, the arguments in the vector will become
975  * the arguments of the generated function (with the order of their
976  * appearance in the vector).
977  */
978  void set_arguments(const std::vector<tiramisu::buffer *> &buffer_vec);
979 
980  /**
981  * Wrapper for all the functions required to run code generation of a
982  * tiramisu program.
983  */
984  void codegen(const std::vector<tiramisu::buffer *> &arguments, const std::string obj_filename, const bool gen_cuda_stmt = false);
985 
986  /**
987  * \brief Set the context of the function.
988  * \details A context is an ISL set that represents constraints over the
989  * parameters of the functions (a parameter is an invariant variable for
990  * the function).
991  * An example of a context set is the following:
992  * "[N,M]->{: M>0 and N>0}"
993  * This context set indicates that the two parameters N and M
994  * are strictly positive.
995  *
996  * This function takes a string that represents an ISL set.
997  */
998  void set_context_set(const std::string &context);
999 
1000  /**
1001  * \overload
1002  *
1003  * This function takes an ISL set as input.
1004  */
1005  void set_context_set(isl_set *context);
1006 
1007 };
1008 
1009 
1010 /**
1011  * A class that represents buffers.
1012  *
1013  * Buffers have two use cases:
1014  * - used to store the results of computations, and
1015  * - used to represent input arguments to functions.
1016  */
1017 class buffer
1018 {
1019  friend computation;
1020  friend function;
1021  friend generator;
1022  friend cuda_ast::generator;
1023 
1024 private:
1025  /**
1026  * A boolean that indicates whether a buffer is allocated or not.
1027  */
1028  bool allocated;
1029 
1030  /**
1031  * Type of the argument (if the buffer is an argument):
1032  * Three possible types:
1033  * - a_input: for inputs of the function,
1034  * - a_output: for outputs of the function,
1035  * - a_temporary: for buffers used as temporary buffers within
1036  * the function (any temporary buffer is allocated automatically by
1037  * the Tiramisu runtime at the entry of the function and is
1038  * deallocated at the exit of the function).
1039  */
1040  tiramisu::argument_t argtype;
1041 
1042  /**
1043  * A boolean indicating whether the buffer should be allocated
1044  * automatically by Tiramisu.
1045  */
1046  bool auto_allocate;
1047 
1048  /**
1049  * The sizes of the dimensions of the buffer. Assuming the following
1050  * buffer buf[N0][N1][N2], dim_sizes should be {N0, N1, N2}.
1051  */
1052  std::vector<tiramisu::expr> dim_sizes;
1053 
1054  /**
1055  * The tiramisu function where this buffer is declared or where the
1056  * buffer is an argument.
1057  */
1058  tiramisu::function *fct;
1059 
1060  /**
1061  * The name of the buffer.
1062  * Buffer names should not start with _ (an underscore).
1063  * Names starting with _ are reserved names.
1064  */
1065  std::string name;
1066 
1067  /**
1068  * The type of the elements of the buffer.
1069  */
1070  tiramisu::primitive_t type;
1071 
1072  /**
1073  * The location of the buffer (host if in memory, else where on GPU).
1074  */
1075  cuda_ast::memory_location location;
1076 
1077 protected:
1078  /**
1079  * Set the type of the argument. Three possible types exist:
1080  * - a_input: for inputs of the function,
1081  * - a_output: for outputs of the function,
1082  * - a_temporary: for buffers used as temporary buffers within
1083  * the function (any temporary buffer is allocated automatically by
1084  * the Tiramisu runtime at the entry of the function and is
1085  * deallocated at the exit of the function).
1086  */
1087  void set_argument_type(tiramisu::argument_t type);
1088 
1089  /**
1090  * Return whether the buffer should be allocated automatically.
1091  */
1092  bool get_auto_allocate();
1093 
1094  /**
1095  * Set the size of a dimension of the buffer.
1096  */
1097  void set_dim_size(int dim, int size);
1098 
1099 public:
1100  /**
1101  * \brief Create a tiramisu buffer.
1102  *
1103  * \details
1104  *
1105  * A Tiramisu buffer is equivalent to an array in C.
1106  *
1107  * Buffers have two use cases:
1108  * - Used to store the results of computations, and
1109  * - Used to represent input arguments to functions.
1110  *
1111  * \p name is the name of the buffer.
1112  *
1113  * \p dim_sizes is a vector of tiramisu expressions that represent the
1114  * size of each dimension in the buffer.
1115  * Assuming we want to declare the buffer buf[N0][N1][N2],
1116  * then the vector of sizes should be {N0, N1, N2}.
1117  * Buffer dimensions in Tiramisu have the same semantics as in
1118  * C/C++.
1119  *
1120  * \p type is the type of the elements of the buffer.
1121  * It must be a primitive type (i.e. p_uint8, p_uint16, ...).
1122  * Possible types are declared in \ref tiramisu::primitive_t
1123  * (in type.h).
1124  *
1125  * \p argt indicates whether this buffer is an input or an output
1126  * buffer and whether it should be allocated automatically by Tiramisu.
1127  * The possible types (\ref tiramisu::argument_t) are:
1128  *
1129  * - tiramisu::a_input: indicates that the buffer is supposed
1130  * to be passed as input to the function generated by Tiramisu.
1131  *
1132  * - tiramisu::a_output: indicates that the buffer is supposed
1133  * to be passed as output to the function generated by Tiramisu.
1134  * Input and output buffers should be allocated by the user
1135  * before calling the Tiramisu generated function. Tiramisu
1136  * does not allocate memory for input and output buffers.
1137  *
1138  * - tiramisu::a_temporary: this buffer is not supposed to
1139  * be passed to the function as an argument. It will be
1140  * allocated automatically by the Tiramisu compiler at
1141  * the beginning of the function and deallocated at the end
1142  * of the function. The user can also allocate the buffer
1143  * "manually" by setting the automatic allocation to false for
1144  * this buffer.
1145  *
1146  * \p fct is a pointer to a Tiramisu function where the buffer is
1147  * declared or used. If this argument is not provided (which is
1148  * the common case), the function that was created automatically
1149  * during Tiramisu initialization will be used (we call that
1150  * function the "implicit function").
1151  *
1152  * Buffer names should not start with _ (an underscore).
1153  * Names starting with _ are reserved names.
1154  */
1155  buffer(std::string name, std::vector<tiramisu::expr> dim_sizes,
 // NOTE(review): the remaining constructor parameters (\p type, \p argt and
 // \p fct, described above) were dropped by the doc-listing extraction;
 // confirm the full signature against the actual core.h.
1158 
1159  /**
1160  * \brief Indicate when to allocate the buffer (i.e., the schedule).
1161  *
1162  * \details The buffer is allocated in the same loop of the computation \p C
1163  * at the loop level \p level (but the order between the two is not
1164  * specified).
1165  *
1166  * For example, let's assume that buf0 is a buffer, and let's assume
1167  * that we have three computations C1, C2 and C3 scheduled as follow
1168  *
1169  * \code
1170  * for (i=0; i<N; i++)
1171  * for (j=0; j<N; j++)
1172  * for (k=0; k<N; k++)
1173  * C1;
1174  *
1175  * for (i=0; i<N; i++) // level 0
1176  * for (j=0; j<N; j++) // level 1
1177  * for (k=0; k<N; k++) // level 2
1178  * {
1179  * C2;
1180  * C3;
1181  * }
1182  * \endcode
1183  *
1184  * The following Tiramisu code
1185  *
1186  * \code
1187  * tiramisu::computation *C4 = buf0.allocate_at(C2, j);
1188  * C4->before(C2, j);
1189  * \endcode
1190  *
1191  * would allocate buf0 in the loop surrounding C2 at the loop
1192  * level 1. The allocation computation is called C4, where
1193  * C4 is scheduled to execute before C2 at the loop level 1.
1194  * The generated code would look like the following code:
1195  *
1196  * \code
1197  * for (i=0; i<N; i++)
1198  * for (j=0; j<N; j++)
1199  * for (k=0; k<N; k++)
1200  * C1;
1201  *
1202  * for (i=0; i<N; i++) // level 0
1203  * for (j=0; j<N; j++) // level 1
1204  * {
1205  * allocate(buf0, buffer_size, buffer_type);
1206  * for (k=0; k<N; k++) // level 2
1207  * {
1208  * C2;
1209  * C3;
1210  * }
1211  * }
1212  * \endcode
1213  *
1214  */
1215  //@{
1217  tiramisu::computation *allocate_at(tiramisu::computation &C, int level);
1218  //@}
1219 
1220  /**
1221  * \brief Dump the function on standard output.
1222  * \details This functions dumps most of the fields of the buffer class
1223  * but not all of them. It is mainly useful for debugging.
1224  * If \p exhaustive is set to true, all the fields of the buffer
1225  * class are printed.
1226  */
1227  void dump(bool exhaustive) const;
1228 
1229  /**
1230  * \brief If this buffer is an argument to a tiramisu::function,
1231  * return the type of the argument
1232  *
1233  * \details Three possible types exist:
1234  * - tiramisu::a_input: for inputs of the function,
1235  * - tiramisu::a_output: for outputs of the function,
1236  * - tiramisu::a_temporary: for buffers used as temporary buffers within
1237  * the function (any temporary buffer is allocated automatically by
1238  * the Tiramisu runtime at the entry of the function and is
1239  * deallocated at the exit of the function).
1240  */
1241  tiramisu::argument_t get_argument_type() const;
1242 
1243  /**
1244  * Return the name of the buffer.
1245  */
1246  const std::string &get_name() const;
1247 
1248  /**
1249  * Get the number of dimensions of the buffer.
1250  */
1251  int get_n_dims() const;
1252 
1253  /**
1254  * Return the type of the elements of the buffer.
1255  */
1256  tiramisu::primitive_t get_elements_type() const;
1257 
1258  /**
1259  * Return the sizes of the dimensions of the buffer.
1260  */
1261  const std::vector<tiramisu::expr> &get_dim_sizes() const;
1262 
1263  /**
1264  * Set whether the buffer should be allocated automatically.
1265  */
1266  void set_auto_allocate(bool auto_allocation);
1267 
1268  /**
1269  * Return true if all extents of the buffer are literal integer
1270  * constants (e.g., 4, 10, 100, ...).
1271  */
1272  bool has_constant_extents();
1273 
1274  /**
1275  * Return true if a statement that allocates the buffer was
1276  * already generated.
1277  */
 // NOTE(review): the top-level const on the bool return value has no effect.
1278  const bool is_allocated() const;
1279 
1280  /**
1281  * Mark an array as already allocated.
1282  */
1283  void mark_as_allocated();
1284 
1285  /** Tag the buffer as located in the GPU global memory. */
1286  void tag_gpu_global();
1287  /** Tag the buffer as located in the GPU register memory.
1288  * The buffer needs to have a unique dimension of size 1 (i.e. needs to be a scalar).*/
1289  void tag_gpu_register();
1290  /** Tag the buffer as located in the GPU shared memory. */
1291  void tag_gpu_shared();
1292  /** Tag the buffer as located in the GPU local thread memory. */
1293  void tag_gpu_local();
1294  /** Tag the buffer as located in the GPU constant memory. */
1295  void tag_gpu_constant();
1296 
1297 };
1298 
1299 /**
1300  * \brief A class that represents computations.
1301  *
1302  * \details A computation is an expression associated with an iteration domain.
1303  * A computation indicates what needs to be computed (the expression
1304  * that should be computed).
1305  * A computation has three representations:
1306  * - Level I: this level specifies "what" should be computed but does
1307  * not specify "when" (order) and "where" (on which processor) each
1308  * expression should be computed.
1309  * This level also does not specify where computations should be stored
1310  * in memory and in which data layout.
1311  * - Level II: this level specifies "what" should be computed, "when", i.e.
1312  * the order in which the computation should be executed with regard to
1313  * the other computations. And "where" each computation should be
1314  * computed (i.e., on which processor).
1315  * This level still does not specify where computations should be stored
1316  * in memory and their data layout.
1317  * - Level III: this level is similar to Level II but it specifies where
1318  * computations should be stored in memory and the data layout.
1319  */
1321 {
1322  friend input;
1323  friend function;
1324  friend generator;
1325  friend buffer;
1326  friend constant;
1327  friend sync;
1328  friend computation_tester;
1329  friend send;
1330  friend recv;
1331  friend tiramisu::wait;
1332  friend cuda_ast::generator;
1333 
1334 private:
1335 
1336  /**
1337  * Access function. A map indicating how each computation should be stored
1338  * in memory. It indicates in which buffer the computation should be stored
1339  * and which element of the buffer exactly it should be stored.
1340  */
1341  isl_map *access;
1342 
1343  /**
1344  * A vector that contains the list of let statements associated
1345  * with this computation.
1346  *
1347  * A let statement that is associated with the computation is a let statement
1348  * that will be added just before the computation. The scope of the variable
1349  * defined by the let statement is this computation alone. i.e., it is not
1350  * defined in other computations.
1351  */
1352  std::vector<std::pair<std::string, tiramisu::expr>> associated_let_stmts;
1353 
1354  /**
1355  * The buffer attached "automatically" to this computation.
1356  * If the buffer is not created automatically, this variable will be empty.
1357  */
1358  tiramisu::buffer *automatically_allocated_buffer;
1359 
1360  /**
1361  * An ISL context associated with the function.
1362  */
1363  isl_ctx *ctx;
1364 
1365  /**
1366  * Data type: the type of the value returned by the computation.
1367  */
1368  tiramisu::primitive_t data_type;
1369 
1370  /**
1371  * Number of definitions added to this computation. Each time the function
1372  * add_definitions is called, definitions_number is incremented.
1373  */
1374  int definitions_number;
1375 
1376  /**
1377  * The ID of this definition. Each new computation when created has a
1378  * definition_ID equal to 0. When a new definition is added, its ID
1379  * is 1, when a new definition is added, its definition ID is set to 2,
1380  * ...
1381  */
1382  int definition_ID;
1383 
1384  /**
1385  * An integer indicating the number of duplicates of this computation.
1386  * We use this to set the duplicate ID of any newly created computation.
1387  * Whenever a new duplicate is created, this number is incremented.
1388  */
1389  int duplicate_number;
1390 
1391  /**
1392  * An expression representing the computation
1393  * ("what" should be computed).
1394  */
1395  tiramisu::expr expression;
1396 
1397  /**
1398  * If has_multiple_definitions() is true, then this variable contains the
1399  * computation that was defined first among all the multiple definitions.
1400  */
1401  tiramisu::computation *first_definition;
1402 
1403  /**
1404  * The function where this computation is declared.
1405  */
1406  tiramisu::function *fct;
1407 
1408  /**
1409  * An isl_ast_expr representing the index of the array where the computation
1410  * will be stored. This index is computed after the scheduling is done.
1411  */
1412  std::vector<isl_ast_expr *> index_expr;
1413 
1414  /**
1415  * A map between the original names of the iterators of a computation
1416  * and their transformed form after schedule (also after renaming).
1417  *
1418  * If in the original computation, we had
1419  *
1420  * {C[i0, i1]: ...}
1421  *
1422  * And if in the generated code, the iterators are called c0, c1, c2 and c3 and
1423  * the loops are tiled, then the map will be
1424  *
1425  * {<i0, c0*10+c2>, <i1, c1*10+c3>}.
1426  */
1427  std::map<std::string, isl_ast_expr *> iterators_map;
1428 
1429  /**
1430  * Does this computation represent a let statement ?
1431  *
1432  * Let statements should be treated differently:
1433  * - During Halide code generation a Halide let statement should be
1434  * created instead of an assignment statement.
1435  * - A let statement does not have/need an access function because
1436  * it writes directly to a scalar.
1437  * - When targeting Halide, let statements should be created after
1438  * their body is created, because the body is an argument needed
1439  * for the creation of the let statement.
1440  */
1441  bool is_let;
1442 
1443  /**
1444  * This variable is set to true if this computation is the first definition.
1445  * It is set to false if has_multiple_definitions() is true but this computation
1446  * is not the first one defined.
1447  * This is useful because in tiramisu, we assume that all the computations that
1448  * have the same name must have the same memory access. Thus any new definition
1449  * of a computation must copy the same memory access as the first definition, thus
1450  * we need to know which computation is the first definition.
1451  */
1452  bool is_first;
1453 
1454  /**
1455  * Return true if this computation is the first definition.
1456  * It returns false if has_multiple_definitions() is true but this computation
1457  * is not the first one defined.
1458  * This is useful because in tiramisu, we assume that all the computations that
1459  * have the same name must have the same memory access. Thus any new definition
1460  * of a computation must copy the same memory access as the first definition, thus
1461  * we need to know which computation is the first definition.
1462  */
1463  bool is_first_definition();
1464 
1465  /** True if the computation is inline. */
1466  bool is_inline;
1467 
1468  /**
1469  * Iteration domain of the computation.
1470  * In this representation, the order of execution of computations
1471  * is not specified, the computations are also not mapped to memory.
1472  */
1473  isl_set *iteration_domain;
1474 
1475  /**
1476  * The name of this computation.
1477  * Computation names should not start with _ (an underscore).
1478  * Names starting with _ are reserved names.
1479  */
1480  std::string name;
1481 
1482  /** The number of dimensions in the original definition of the computation. */
1483  int number_of_dims;
1484 
1485  /**
1486  * A predicate around the computation. The computation is executed
1487  * only if this predicate is true. This is useful to insert a non-affine
1488  * condition around the computation.
1489  */
1490  tiramisu::expr predicate;
1491 
1492  /**
1493  * The schedule of the computation.
1494  */
1495  isl_map * schedule;
1496 
1497  /**
1498  * TODO: use buffers directly from computations, no need to have
1499  * bindings.
1500  *
1501  * \p schedule_this_computation should be set to true when the computation
1502  * should be scheduled and when code for the computation should be generated
1503  * during code generation.
1504  * It should be set to false when the computation is used to represent a
1505  * buffer (i.e., the computation is used only as a binding to a buffer).
1506  * In this case, the computation is not scheduled and no code for the
1507  * computation is generated.
1508  */
1509  bool schedule_this_computation;
1510 
1511  // Private class members that are computed during code generation.
1512 
1513  /**
1514  * Halide statement that assigns the computation to a buffer location.
1515  */
1516  Halide::Internal::Stmt stmt;
1517 
1518  /**
1519  * Time-processor domain of the computation.
1520  * In this representation, the logical time of execution and the
1521  * processor where the computation will be executed are both
1522  * specified.
1523  */
1524  isl_set *time_processor_domain;
1525 
1526  /**
1527  * True if the computation is executed in a nonblocking or asynchronous way.
1528  */
1529  bool _is_nonblock_or_async;
1530 
1531  /**
1532  * True if the rank should be dropped from index linearization.
1533  */
1534  bool _drop_rank_iter;
1535 
1536  /**
1537  * If _drop_rank_iter == true, this is the level to drop
1538  */
1539  var drop_level;
1540 
1541  /**
1542  * If the computation represents a library call, this will contain the
1543  * necessary arguments to the function.
1544  */
1545  std::vector<tiramisu::expr> library_call_args;
1546 
1547  /**
1548  * If the computation represents a library call and requires a linear index (for either a LHS or RHS access),
1549  * this indicates which argument to the function represents the linear index.
1550  */
1551  // @{
1552  int lhs_argument_idx;
1553  int rhs_argument_idx;
1554  // @}
1555 
1556  /**
1557  * If the computation represents a library call and requires a wait, this indicates which argument to the function
1558  * represents the wait request (NOTE(review): original said "linear index", likely copy-pasted from above — confirm).
1559  */
1560  int wait_argument_idx;
1561 
1562  /**
1563  * If the computation represents a library call and requires a wait, this is like a regular access map, but gives
1564  * an access into a special wait buffer.
1565  */
1566  isl_map *wait_access_map = nullptr;
1567 
1568  /**
1569  * By default, this becomes an o_access to signify that we are writing into a location. However, it can be changed
1570  * to something like o_address_of to indicate that we don't want to do a store to the location, rather we just
1571  * want to get a pointer to the store location.
1572  */
1573  // @{
1574  tiramisu::op_t lhs_access_type;
1575  tiramisu::op_t rhs_access_type;
1576  // @}
1577 
1578  /**
1579  * Apply a transformation on the domain of the schedule.
1580  * This is a transformation from iteration domain to the time-processor
1581  * domain.
1582  *
1583  * For example, to shift the i dimension of the iteration domain
1584  * of C0, you can apply the transformation
1585  *
1586  * C0[i, j] -> C0[i+2, j]
1587  */
1588  void apply_transformation_on_schedule_domain(std::string map_str);
1589 
1590  /**
1591  * Add the set of constraints \p domain_constraints to the domain
1592  * of the schedule and add the set of constraints \p range_constraints
1593  * to the range of the schedule.
1594  *
1595  * \p domain_constraints and \p range_constraints are both ISL sets in
1596  * the ISL string format.
1597  */
1598  void add_schedule_constraint(std::string domain_constraints, std::string range_constraints);
1599 
1600  /**
1601  * Check that the names used in \p dimensions are not already
1602  * in use.
1603  */
1604  void assert_names_not_assigned(std::vector<std::string> dimensions);
1605 
1606  /**
1607  * Return true if a buffer was allocated to this computation or to one
1608  * of its updates (we assume that we allocate the same buffer for the
1609  * computation and its updates).
1610  */
1611  bool buffer_already_allocated();
1612 
1613  /**
1614  * Identical to
1615  * void before(computation &consumer, tiramisu::var L);
1616  * Except that the loop level in this case is identified using its number. This is
1617  * used mainly internally by Tiramisu while the other is designed to be user friendly.
1618  */
1619  void before(computation &consumer, int L);
1620 
1621  /**
1622  * Check that the \p dimensions are valid:
1623  * - The dimension numbers are within the bounds of accepted dimensions
1624  * (i.e., between computation::root_dimension and the maximal dimension
1625  * number in the time-space domain).
1626  */
1627  void check_dimensions_validity(std::vector<int> dimensions);
1628 
1629  /**
1630  * Compute two subsets of computations:
1631  * - the first is the subset of needed computations,
1632  * - the second is the subset of produced computations,
1633  *
1634  * The needed computations are those that need to be consumed
1635  * by the \p consumer if this computation is to be executed in the
1636  * same loop as the consumer but at loop level \p L.
1637  *
1638  * The produced subset represents the computations produced by
1639  * this computation at the level \p L.
1640  *
1641  * This function takes in consideration the schedule of
1642  * this computation and the schedule of consumer in order to
1643  * compute the needed area.
1644  *
1645  * This function creates new parameters, these parameters are used to specialize
1646  * the needed area for specific loop iterators (i.e., fix the needed area).
1647  * These parameters are stored in \p param_names.
1648  * This vector is used internally by Tiramisu in compute_at() which calls this
1649  * function.
1650  */
1651  std::vector<isl_set *> compute_needed_and_produced(computation &consumer, int L,
1652  std::vector<std::string> &param_names);
1653 
1654 
1654 
1655  /**
1656  * Take a list of iterators and a name as input and construct the iteration
1657  * domain that iterates over those iterators. The name of the statement
1658  * of the iteration domain is \p name.
1659  * The iteration domain is a string in ISL format.
1660  */
1661  std::string construct_iteration_domain(std::string name, std::vector<var> iterator_variables);
1662 
1663  /**
1664  * Create a copy of this computation.
1665  */
1666  tiramisu::computation *copy();
1667 
1668  /**
1669  * Create a Halide statement that assigns the computation to the memory
1670  * buffer and location specified by the access function.
1671  */
1672  void create_halide_assignment();
1673 
1674  /**
1675  * Returns a pair of expressions (LHS, RHS) representing the assignment performed
1676  * by the computation.
1677  * The LHS would represent a memory buffer access, and the RHS would represent
1678  * the value of the assignment.
1679  */
1680  std::pair<expr, expr> create_tiramisu_assignment(std::vector<isl_ast_expr *> &index_expr);
1681 
1682  /**
1683  * Apply a duplication transformation from iteration space to
1684  * time-processor space.
1685  * A duplication transformation duplicates the original computation,
1686  * so the domain of the schedule has to be the iteration domain of
1687  * the original computation.
1688  *
1689  * For example, to duplicate C0 into a first duplicate:
1690  *
1691  * C0[i, j] -> C0[1, 0, i, 0, j, 0]
1692  *
1693  * To duplicate C0 again
1694  *
1695  * C0[i, j] -> C0[2, 0, j, 0, i, 0]
1696  *
1697  */
1698  void create_duplication_transformation(std::string map_str);
1699 
1700  /**
1701  * Create a new Tiramisu constant M = v*floor(N/v) and use it as
1702  * a separator.
1703  *
1704  * The separator is used to separate a computation. That
1705  * is, it is used to create two identical computations where we have
1706  * a constraint like i<M in the first and i>=M in the second.
1707  * The first is called the full computation while the second is called
1708  * the separated computation.
1709  *
1710  * This function is used in vectorize and unroll mainly.
1711  */
1712  tiramisu::constant *create_separator(const tiramisu::expr &loop_upper_bound, int v);
1713 
1714  /**
1715  * Duplicate a part of this computation (or all of it) and return
1716  * the duplicate computation. The duplicate computation is identical
1717  * to this computation in every aspect except that its iteration domain
1718  * is the intersection of the iteration domain of the original computation
1719  * and the domain provided in \p domain_constraints.
1720  *
1721  * Example: let us assume that you have the following computation
1722  *
1723  * {C0[i,j]: 0<=i<N and 0<=j<N}
1724  *
1725  * If you want to create a duplicate of this computation
1726  * that has the following iteration domain
1727  *
1728  * "{C0[i,j]: 0<=i<=10 and 0<=j<=5}"
1729  *
1730  * you can write
1731  *
1732  * C0.duplicate("{C0[i,j]: 0<=i<=10 and 0<=j<=5}");
1733  *
1734  * This will keep the original computation C0 and will create a
1735  * new computation (duplicate of C0) that has
1736  * "{C0[i,j]: 0<=i<=10 and 0<=j<=5}" as iteration domain.
1737  * (duplicate() in fact intersects the domain provided with
1738  * the original domain).
1739  *
1740  * The duplicate computation is an exact copy of the original
1741  * computation except in one thing:
1742  * The iteration domain of the duplicate computation is
1743  * the intersection of the iteration domain of the original
1744  * computation with the constraints provided as an argument
1745  * to the duplicate command; this means that the iteration domain
1746  * of the duplicate computation is always a subset of the original
1747  * iteration domain;
1748  *
1749  * If you schedule a computation C, then duplicate it, then the duplicate
1750  * will have exactly the same schedule as the original computation.
1751  * After duplication, any schedule applied on the original computation
1752  * will not be applied automatically on the duplicate computation. They
1753  * become two separate computations that need to be scheduled separately.
1754  *
1755  * \p domain_constraints is a set of constraints on the iteration domain that
1756  * define the duplicate.
1757  * \p range_constraints is a set of constraints on the time-processor domain
1758  * that define the duplicate.
1759  */
1760  tiramisu::computation *duplicate(std::string domain_constraints,
1761  std::string range_constraints);
1762 
1763  /**
1764  * Return the access function of the computation.
1765  */
1766  isl_map *get_access_relation() const;
1767 
1768  /**
1769  * Return the access function of the computation after transforming
1770  * it to the time-processor domain.
1771  * The domain of the access function is transformed to the
1772  * time-processor domain using the schedule, and then the transformed
1773  * access function is returned.
1774  */
1775  isl_map *get_access_relation_adapted_to_time_processor_domain() const;
1776 
1777  /**
1778  * Return vector of associated let statements.
1779  *
1780  * This is a vector that contains the list of let statements
1781  * associated with this computation.
1782  *
1783  * A let statement that is associated with the computation is a
1784  * let statement that will be added just before the computation.
1785  * The scope of the variable defined by the let statement is this
1786  * computation alone. i.e., it is not defined in other computations.
1787  */
1788  const std::vector<std::pair<std::string, tiramisu::expr>> &get_associated_let_stmts() const;
1789 
1790  /**
1791  * Get the name of the dynamic dimension that corresponds to the
1792  * \p loop_level in the time-space domain.
1793  */
1794  std::string get_dimension_name_for_loop_level(int loop_level);
1795 
1796  /**
1797  * Return the number of the duplicates of this computation.
1798  *
1799  * The number of duplicates is incremented if the computation is duplicated using
1800  * the duplicate() function.
1801  */
1802  int get_duplicates_number() const;
1803 
1804  /**
1805  * If has_multiple_definitions() is true, then this function returns the
1806  * computation that was defined first among all the multiple definitions.
1807  */
1808  tiramisu::computation *get_first_definition();
1809 
1810  /**
1811  * Return the function where the computation is declared.
1812  */
1813  tiramisu::function *get_function() const;
1814 
1815  /**
1816  * Return the Halide statement that assigns the computation to a buffer location.
1817  * Before calling this function the user should first call Halide code generation
1818  * (function::gen_halide_stmt()).
1819  */
1820  Halide::Internal::Stmt get_generated_halide_stmt() const;
1821 
1822  /**
1823  * Return vector of isl_ast_expr representing the indices of the array where
1824  * the computation will be stored.
1825  */
1826  std::vector<isl_ast_expr *> &get_index_expr();
1827 
1828  /**
1829  * Get the union of the iteration domains of this computation and
1830  * all the other definitions of this computation (updates,
1831  * duplicates, ...).
1832  */
1833  isl_set *get_iteration_domains_of_all_definitions();
1834 
1835 
1836  /**
1837  * The iterators map is a map between the original names of the iterators of a computation
1838  * and their transformed form after schedule (also after renaming).
1839  *
1840  * If in the original computation, we had
1841  *
1842  * {C[i0, i1]: ...}
1843  *
1844  * And if in the generated code, the iterators are called c0, c1, c2 and c3 and
1845  * the loops are tiled, then the map will be
1846  *
1847  * {<i0, c0*10+c2>, <i1, c1*10+c3>}.
1848  */
1849  std::map<std::string, isl_ast_expr *> get_iterators_map();
1850 
1851  /**
1852  * Return the names of iteration domain dimensions.
1853  */
1854  std::vector<std::string> get_iteration_domain_dimension_names();
1855 
1856  /**
1857  * Get the number of dimensions of the iteration
1858  * domain of the computation.
1859  */
1860  int get_iteration_domain_dimensions_number();
1861 
1862  /**
1863  * Return the names of loop levels.
1864  * (i.e., the names of dynamic dimensions in time-space).
1865  */
1866  std::vector<std::string> get_loop_level_names();
1867 
1868  /**
1869  * Get the number of loop levels.
1870  */
1871  int get_loop_levels_number();
1872 
1873  /**
1874  * Gets the span of the loop level \p level (number of iterations) and returns
1875  * it as a tiramisu::expr.
1876  */
1877  expr get_span(int level);
1878 
1879  /**
1880  * Return the root of the tree of definitions of this computation.
1881  * The tree of definitions is the tree that emerges after adding multiple
1882  * definitions to the same computation. Let's take the following example.
1883  * Assuming we have the computation c0. We can add a definition c1 to c0.
1884  * The tree in this case would have two nodes, c0 and c1. c0 is the root.
1885  * We can add another definition c2 to c0. Then we can add another definition c3
1886  * to c1. In this case the tree would have four nodes where c0 is the root.
1887  */
1888  tiramisu::computation *get_root_of_definition_tree();
1889 
1890  /**
1891  * Get the number of dimensions of the time-space
1892  * domain of the computation.
1893  */
1894  int get_time_space_dimensions_number();
1895 
1896  /**
1897  * Trim the union of schedules of the computation and
1898  * return the result.
1899  * The trimmed union of schedules is the schedule without the
1900  * duplicate dimension (the dimension used to indicate the
1901  * duplicate ID).
1902  */
1903  isl_map *get_trimmed_union_of_schedules() const;
1904 
1905  /**
1906  * Return the time-processor domain of the computation.
1907  * In this representation, the logical time of execution and the
1908  * processor where the computation will be executed are both
1909  * specified.
1910  */
1911  isl_set *get_time_processor_domain() const;
1912 
1913  /**
1914  * Return the trimmed time-processor domain.
1915  * The first dimension of the time-processor domain is used
1916  * to indicate redundancy of the computation. Computations that
1917  * are not redundant have 0 in that dimension, whereas
1918  * computations that are redundant (i.e., are recomputed) have
1919  * a number different from 0 in that dimension and which represents
1920  * the ID of the redundant computation.
1921  * The trimmed time-processor domain is the time-processor domain
1922  * without the dimension that represents the redundancy. We simply
1923  * take the time-processor domain and remove the first dimension.
1924  */
1925  isl_set *get_trimmed_time_processor_domain();
1926 
1927  /**
1928  * Generate an identity schedule for the computation.
1929  *
1930  * This identity schedule is an identity relation created from the iteration
1931  * domain.
1932  */
1933  isl_map *gen_identity_schedule_for_iteration_domain();
1934 
1935  /**
1936  * Generate an identity schedule for the computation.
1937  *
1938  * This identity schedule is an identity relation created from the
1939  * time-processor domain.
1940  */
1941  isl_map *gen_identity_schedule_for_time_space_domain();
1942 
1943  /**
1944  * Returns all updates that have been defined for this computation using
1945  * add_definitions. The 0th update is a pointer to this computation.
1946  */
1947  std::vector<computation*>& get_updates();
1948 
1949  /**
1950  * Search the time-space domain (the range of the schedule) and
1951  * return the loop level numbers that correspond to the dimensions
1952  * named \p dim.
1953  * In other words, translate the vector of dimension names (\p dim_names)
1954  * into loop level numbers. We need to do this because the internal Tiramisu
1955  * scheduling functions use dimension numbers instead of dimension
1956  * names (which are used in the user level scheduling functions).
1957  */
1958  std::vector<int> get_loop_level_numbers_from_dimension_names(std::vector<std::string> dim_names);
1959 
1960  /**
1961  * Return true if this computation is supposed to have an access to other
1962  * computations.
1963  * Knowing this is important so that tiramisu does not try to compute
1964  * the accesses of the RHS.
1965  */
1966  bool has_accesses() const;
1967 
1968  /**
1969  * Return if this computation represents a let statement.
1970  *
1971  * Let statements should be treated differently because:
1972  * - A let statement does not have/need an access function because
1973  * it writes directly to a scalar.
1974  * - If the backend is Halide:
1975  * - During Halide code generation a Halide let statement
1976  * should be created instead of an assignment statement.
1977  * - When targeting Halide, let statements should be created
1978  * after their body is created, because the body is an argument
1979  * needed for the creation of the let statement.
1980  */
1981  bool is_let_stmt() const;
1982 
1983  /**
1984  * Return true if this computation represents a call to a library function.
1985  * Mainly used for distributed communication functions.
1986  */
1987  bool is_library_call() const;
1988 
1989  /**
1990  * Return true if the rank loop iterator should be removed from linearization.
1991  */
1992  bool should_drop_rank_iter() const;
1993 
1994  int get_level_to_drop();
1995 
1996  /**
1997  * Assign a name to iteration domain dimensions that do not have a name.
1998  */
1999  void name_unnamed_iteration_domain_dimensions();
2000 
2001  /**
2002  * Assign a name to iteration domain dimensions that do not have a name.
2003  */
2004  void name_unnamed_time_space_dimensions();
2005 
2006  /**
2007  * Set an identity schedule for the computation.
2008  *
2009  * This identity schedule is an identity relation created from the iteration
2010  * domain.
2011  *
2012  * This sets the schedule of the original computation
2013  * and does not set the schedule of any duplicate
2014  * computation.
2015  */
2016  void set_identity_schedule_based_on_iteration_domain();
2017 
2018  /**
2019  * Return true if this computation is supposed to be scheduled
2020  * by Tiramisu.
2021  */
2022  bool should_schedule_this_computation() const;
2023 
2024  /**
2025  * Intersect \p set with the context of the computation.
2026  */
2027  // @{
2028  isl_set *intersect_set_with_context(isl_set *set);
2029  isl_map *intersect_map_domain_with_context(isl_map *map);
2030  // @}
2031 
2032  /**
2033  * Rename this computation and modify the schedule and the access relation
2034  * accordingly.
2035  *
2036  * Computations that are defined multiple times need to be renamed, because
2037  * those computations in general have different expressions and the code
2038  * generator expects that computations that have the same name always have
2039  * the same expression and access relation. So we should rename them to avoid
2040  * any ambiguity for the code generator.
2041  */
2042  void rename_computation(std::string new_name);
2043 
2044  /**
2045  * Separate the iteration domain into two iteration domains using
2046  * the constants \p N and \p v.
2047  * Let us assume that the dimension \p dim of the iteration domain
2048  * is called i. The iteration domain is separated into two domains
2049  * using the hyperplane (i = v*floor(N/v)). That means, two copies of the
2050  * iteration domain are created, the constraint (i<=v*floor(N/v)) is added to
2051  * the schedule of the first while the constraint (i>v*floor(N/v)) is added to
2052  * the schedule of the second.
2053  *
2054  * Let us assume that we have the following iteration domain
2055  *
2056  * {S0[i,j]: 0<=i<N and 0<=j<M}
2057  *
2058  * To separate this iteration domain by the hyperplane j=4*floor(M/4), one should
2059  * call
2060  *
2061  * S0.separate(1, tiramisu::expr("M"), 4)
2062  *
2063  * This will result in the creation of two computations that have the same
2064  * iteration domains but have different schedules. The schedules are as
2065  * follows:
2066  *
2067  * The schedule of the original (full) computation would be
2068  * {S0[i,j]->S0[0, 0, i, 0, j, 0]: j<4*floor(M/4)}
2069  *
2070  * The schedule of the separated (partial) computation would be
2071  * {S0[i,j]->S0[0, 0, i, 10, j, 0]: 4*floor(M/4)<=j}
2072  *
2073  * The second computation created using separate can be accessed with
2074  * get_update().
2075  */
2076  void separate(int dim, tiramisu::expr N, int v);
2077 
2078  /**
2079  * Like separate, but the precise separate points are specified in _separate_points. _max is the loop max.
2080  */
2081  void separate_at(var level, std::vector<tiramisu::expr> _separate_points, tiramisu::expr _max);
2082 
2083  /**
2084  * Separate then split the loop \p L0
2085  * (see tiramisu::computation::separate and tiramisu::computation::split)
2086  *
2087  * This function returns true if the split actually happened (in some
2088  * cases, if the loop cannot be split because it is too small for example,
2089  * then no split happens even after calling the split function, in such
2090  * cases the function returns false).
2091  */
2092  //@{
2093  bool separateAndSplit(tiramisu::var L0, int sizeX);
2094  bool separateAndSplit(tiramisu::var L0, int sizeX,
2095  tiramisu::var L0_outer, tiramisu::var L0_inner);
2096  bool separateAndSplit(int L0, int sizeX);
2097  //@}
2098 
2099  /**
2100  * Split the loop level \p L0 of the iteration space into two
2101  * new loop levels.
2102  *
2103  * \p sizeX is the extent (size) of the inner loop created after
2104  * splitting.
2105  */
2106  //@{
2107  //@}
2108 
2109  /**
2110  * Set the names of loop levels dimensions.
2111  * The loop levels are specified using \p loop_levels
2112  * and their names are specified using \p names.
2113  * Users can only set the names of loop levels (dynamic dimensions),
2114  * the static dimension names are set to default names.
2115  */
2116  void set_loop_level_names(std::vector<int> loop_levels, std::vector<std::string> names);
2117 
2118  /**
2119  * Set the names of loop level dimensions.
2120  * The loop level names are specified using \p names.
2121  * Users can only set the names of loop levels (dynamic dimensions),
2122  * the static dimension names are set to default names.
2123  */
2124  void set_loop_level_names(std::vector<std::string> names);
2125 
2126  /**
2127  * Set the names of the dimensions of the schedule domain.
2128  * The dimension names are specified using \p names, their positions
2129  * are indicated using \p loop_levels.
2130  */
2131  void set_schedule_domain_dim_names(std::vector<int> loop_levels, std::vector<std::string> names);
2132 
2133  /**
2134  * Set the iteration domain of the computation
2135  */
2136  void set_iteration_domain(isl_set *domain);
2137 
2138  /**
2139  * Set whether a computation has multiple definitions.
2140  * i.e., whether the computation is defined multiple times (i.e.,
2141  * whether there are many computations with the same name). An
2142  * update is a special case of a computation defined multiple
2143  * times.
2144  */
2145  void set_has_multiple_definitions(bool val);
2146 
2147  /**
2148  * The iterators map is a map between the original names of the iterators of a computation
2149  * and their transformed form after schedule (also after renaming).
2150  *
2151  * If in the original computation, we had
2152  *
2153  * {C[i0, i1]: ...}
2154  *
2155  * And if in the generated code, the iterators are called c0, c1, c2 and c3 and
2156  * the loops are tiled, then the map will be
2157  *
2158  * {<i0, c0*10+c2>, <i1, c1*10+c3>}.
2159  */
2160  void set_iterators_map(std::map<std::string, isl_ast_expr *> map);
2161 
2162  /**
2163  * Identical to
2164  * void shift(tiramisu::var L0, int n);
2165  */
2166  void shift(int L0, int n);
2167 
2168  /**
2169  * Simplify \p set using the context and by calling
2170  * set coalescing.
2171  */
2172  // @{
2173  isl_set *simplify(isl_set *set);
2174  isl_map *simplify(isl_map *map);
2175  // @}
2176 
2177  /**
2178  * Identical to
2179  * \code
2180  * void tag_gpu_level(tiramisu::var L0, tiramisu::var L1);
2181  * void tag_gpu_level(tiramisu::var L0, tiramisu::var L1,
2182  * tiramisu::var L2, tiramisu::var L3);
2183  * void tag_gpu_level(tiramisu::var L0, tiramisu::var L1,
2184  * tiramisu::var L2, tiramisu::var L3,
2185  * tiramisu::var L4, tiramisu::var L5);
2186  * \endcode
2187  * The outermost loop level is 0.
2188  */
2189  // @{
2190  void tag_gpu_level(int L0, int L1);
2191  void tag_gpu_level(int L0, int L1, int L2, int L3);
2192  void tag_gpu_level(int L0, int L1, int L2, int L3, int L4, int L5);
2193  // @}
2194 
2195  /**
2196  * Tag the loop levels \p L0 and \p L1 to be mapped to GPU block
2197  * dimensions.
2198  *
2199  * The outermost loop level is 0.
2200  */
2201  // @{
2202  void tag_gpu_block_level(int L0);
2203  void tag_gpu_block_level(int L0, int L1);
2204  void tag_gpu_block_level(int L0, int L1, int L2);
2205  // @}
2206 
2207  /**
2208  * Tag the loop levels \p L0 and \p L1 to be mapped to GPU thread
2209  * dimensions.
2210  *
2211  * The outermost loop level is 0.
2212  */
2213  // @{
2214  void tag_gpu_thread_level(int L0);
2215  void tag_gpu_thread_level(int L0, int L1);
2216  void tag_gpu_thread_level(int L0, int L1, int L2);
2217  // @}
2218 
2219  /**
2220  * Contains a list of all definitions added to this computation. The 0th definition is
2221  * always this very computation.
2222  */
2223  std::vector<tiramisu::computation *> updates;
2224 
2225  /**
2226  * Update loop level names. This function should be called after each scheduling operation
2227  * because scheduling usually changes the original loop level names.
2228  * This function erases \p nb_loop_levels_to_erase loop level names starting from the
2229  * loop level \p start_erasing. It then inserts the loop level names \p new_names in
2230  * \p start_erasing. In other words, it replaces the names of loop levels from
2231  * \p start_erasing to \p start_erasing + \p nb_loop_levels_to_erase with the loop levels
2232  * indicated by \p new_names. This function sets the non-erased loop levels to be equal to the
2233  * original loop level names.
2234  *
2235  * \p original_loop_level_names : a vector containing the original loop level names (loop level
2236  * names before scheduling).
2237  *
2238  * \p new_names : the new loop level names.
2239  *
2240  * \p start_erasing : start erasing loop levels from this loop level.
2241  *
2242  * \p nb_loop_levels_to_erase : number of loop levels to erase.
2243  *
2244  * Example. Assuming the original loop levels are {i0, i1, i2, i3}
2245  *
2246  * Calling this->update_names({i0, i1, i2, i3}, {x0, x1}, 1, 2) updates the loop levels to become
2247  * {i0, x0, x1, i3}.
2248  */
2249  void update_names(std::vector<std::string> original_loop_level_names, std::vector<std::string> new_names,
2250  int start_erasing, int nb_loop_levels_to_erase);
2251 
2252  /**
2253  * A vector describing the access variables in the original definition of a computation.
2254  * For every named dimension, a pair representing the index of the named dimension
2255  * and the name of the dimension is added to access_variables.
2256  * E.g. if a computation is defined as S[i, 0, j], access_variables will contain
2257  * {(0, "i"), (2, "j")}.
2258  */
2259  std::vector<std::pair<int, std::string>> access_variables;
2260 
2261  /**
2262  * Compare two computations.
2263  *
2264  * Two computations are considered to be equal if they have the
2265  * same name.
2266  */
2267  bool operator==(tiramisu::computation comp1);
2268 
2269 protected:
2270 
2271  /**
2272  * True if this computation represents a library call.
2273  */
2275 
2276  /**
2277  * If the computation represents a library call, this is the name of the function.
2278  */
2279  std::string library_call_name;
2280 
2281  /**
2282  * An index expression just for the request buffer.
2283  */
2284  isl_ast_expr *wait_index_expr;
2285 
2286  /**
2287  * Dummy constructor for derived classes.
2288  */
2289  computation();
2290 
2291  /**
2292  * Compute the size of the buffer allocated automatically to hold the
2293  * results of this computation.
2294  */
2295  std::vector<tiramisu::expr>* compute_buffer_size();
2296 
2297  /**
2298  * Return the context of the computations.
2299  */
2300  isl_ctx *get_ctx() const;
2301 
2302  /**
2303  * Return the predicate around this computation if a predicate
2304  * was added using add_predicate().
2305  */
2306  tiramisu::expr get_predicate();
2307 
2308  /**
2309  * Return a unique name of computation; made of the following pattern:
2310  * [computation name]@[computation address in memory]
2311  */
2312  const std::string get_unique_name() const;
2313 
2314  /**
2315  *
2316  * Return true if the computation has multiple definitions.
2317  * i.e., if the computation is defined multiple times.
2318  * An update is a special case where a computation is defined
2319  * multiple times. Duplicate computations are another example.
2320  *
2321  * In the following example, C is defined multiple times whereas
2322  * D is defined only once.
2323  *
2324  * \code
2325  *
2326  * C(0) = 0
2327  *
2328  * C(i) = C(i-1) + 1
2329  *
2330  * D(i) = C(i) + 1
2331  *
2332  * \endcode
2333  */
2334  bool has_multiple_definitions();
2335 
2336  /**
2337  * Initialize a computation.
2338  * This is a private function that should not be called explicitly
2339  * by users.
2340  */
2341  void init_computation(std::string iteration_space_str,
2342  tiramisu::function *fct,
2343  const tiramisu::expr &e,
2344  bool schedule_this_computation,
2346 
2347  /**
2348  * Set the name of the computation.
2349  */
2350  void set_name(const std::string &n);
2351 
2352  /**
2353  * Set the schedule indicated by \p map.
2354  *
2355  * \p map is a string that represents a mapping from the iteration domain
2356  * to the time-space domain (the ISL format to represent maps is
2357  * documented in http://barvinok.gforge.inria.fr/barvinok.pdf in Sec 1.2.2).
2358  *
2359  * The schedule is a map from the iteration domain to a time space
2360  * domain. The same name of space should be used for both the range
2361  * and the domain of the schedule.
2362  *
2363  * In general, users can set the schedule using high level functions such
2364  * as before(), after(), tile(), compute_at(), vectorize(), split(), ...
2365  * The use of this function is only reserved for advanced users who want
2366  * a low level control of the schedule.
2367  *
2368  * Vectors in the time-space domain have the following form
2369  *
2370  * computation_name[redundancy_ID,static,dynamic,static,dynamic,static,dynamic,static,...]
2371  *
2372  * The first dimension of the vector is used to indicate the redundancy ID
2373  * (the notion of the redundancy ID is explained later).
2374  *
2375  * The following dimensions are interleaved dimensions: static, dynamic, static,
2376  * dynamic, ...
2377  * Dynamic dimensions represent the loop levels, while static dimensions are
2378  * used to order statements within a given block of statements in a given loop
2379  * level.
2380  * For example, the computations c0 and c1 in the following loop nest
2381  *
2382  * \code
2383  * for (i=0; i<N: i++)
2384  * for (j=0; j<N; j++)
2385  * {
2386  * c0;
2387  * c1;
2388  * }
2389  *
2390  * \endcode
2391  *
2392  * have the following representations in the iteration domain
2393  *
2394  * \code
2395  * {c0[i,j]: 0<=i<N and 0<=j<N}
2396  * {c1[i,j]: 0<=i<N and 0<=j<N}
2397  * \endcode
2398  *
2399  * and the following representation in the time-space domain
2400  *
2401  * \code
2402  * {c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
2403  * {c1[0,0,i,0,j,1]: 0<=i<N and 0<=j<N}
2404  * \endcode
2405  *
2406  * The first dimension (dimension 0) in the time-space
2407  * domain (the leftmost dimension) is the redundancy ID
2408  * (in this case it is 0, the meaning of this ID will be explained later).
2409  * The second dimension (starting from the left) is a static dimension,
2410  * the third dimension is a dynamic dimension that
2411  * represents the loop level i, ..., the fifth dimension is a dynamic
2412  * dimension that represents the loop level j and the last dimension
2413  * (dimension 5) is a static dimension and allows the ordering of
2414  * c1 after c0 in the loop nest.
2415  *
2416  * To transform the previous iteration domain to the
2417  * time-space domain, the following schedule should be used:
2418  *
2419  * \code
2420  * {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
2421  * {c1[i,j]->c1[0,0,i,0,j,1]: 0<=i<N and 0<=j<N}
2422  * \endcode
2423  *
2424  * The first dimension called "redundancy ID" is only meaningful if the
2425  * computation is redundant. i.e., some parts of the computation are
2426  * redundantly computed. Redundant computations are in general used to
2427  * maximize parallelism and data locality on the expense of doing some
2428  * computations redundantly.
2429  * For example, if the two computations c1(i,j) and c2(i,j) both depend
2430  * on the computation c0(i,j), instead of waiting for c0(i,j) to be
2431  * computed and then computing c1(i,j) and c2(i,j) in parallel, the thread
2432  * executing c1(i,j) can compute c0(i,j) by itself and then run c1(i,j).
2433  * The thread that computes c2(i,j) can do the same and compute c0(i,j)
2434  * by itself and then compute c2(i,j). In this case the two threads do not
2435  * need to wait. This is done at the expense of redundant computation since
2436  * c0(i,j) is computed by both threads.
2437  *
2438  * In general redundant computations are useful when tiling stencil
2439  * computations. In the context of stencils such a tiling is called
2440  * "overlapped tiling". Tiles that depend on results computed by other
2441  * tiles that run in parallel can compute the boundaries redundantly which
2442  * allows them to avoid waiting and thus can run in parallel.
2443  *
2444  * In Tiramisu, the user can indicate that a chunk of a computation
2445  * should be computed redundantly. The original computation always has a redundancy
2446  * ID equal to 0 (which means this is the original computation).
2447  * The redundant chunk has an ID that is different from 0 and that is
2448  * used to uniquely identify it.
2449  *
2450  * For example if we want to compute all of c0 three times (that is,
2451  * compute the original computation and compute two redundant computations),
2452  * we can use the following schedules:
2453  *
2454  * The schedule of the original computation: {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
2455  * The schedule of the redundant computation N°1: {c0[i,j]->c0[1,0,i,0,j,0]: 0<=i<N and 0<=j<N}
2456  * The schedule of the redundant computation N°2: {c0[i,j]->c0[2,0,i,0,j,0]: 0<=i<N and 0<=j<N}
2457  *
2458  * The schedule of c0 in this case would be three maps that map c0[i,j] to
2459  * the three different redundant computations in the time-processor domain:
2460  *
2461  * \code
2462  * {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N;
2463  * c0[i,j]->c0[1,0,i,0,j,0]: 0<=i<N and 0<=j<N;
2464  * c0[i,j]->c0[2,0,i,0,j,0]: 0<=i<N and 0<=j<N}
2465  * \endcode
2466  *
2467  * The function set_schedule() overrides any other schedule set by the high level
2468  * scheduling functions. Currently the user has to choose between using the high
2469  * level scheduling functions or using this low level set_schedule function. The user
2470  * cannot mix the use of the two in the same program because they are not compatible.
2471  */
2472  // @{
2473  void set_schedule(isl_map *map);
2474  void set_schedule(std::string map_str);
2475  // @}
2476 
2477  /**
2478  * Collapse all the iterations of a loop into one single iteration.
2479  */
2480  void full_loop_level_collapse(int level, tiramisu::expr collapse_from_iter);
2481  /**
2482  * \overload
2483  */
2484  computation(std::string name,std::vector<var> iterator_variables, tiramisu::expr e, bool schedule_this_computation, primitive_t t);
2485 
2486 
2487 
2488 public:
2489 
2490  /**
2491  * \brief Constructor for computations.
2492  *
2493  * \details \p iteration_domain is a string that represents the iteration
2494  * domain of the computation. The iteration domain should be written
2495  * in the ISL format (http://barvinok.gforge.inria.fr/barvinok.pdf Section 1.2.1).
2496  *
2497  * The iteration domain of a statement is a set that contains
2498  * all of the execution instances of the statement (a statement in a
2499  * loop has an execution instance for each loop iteration in which
2500  * it executes). Each execution instance of a statement in a loop
2501  * nest is uniquely represented by an identifier and a tuple of
2502  * integers (typically, the values of the outer loop iterators).
2503  *
2504  * For example, the iteration space of the statement S0 in the following
2505  * loop nest
2506  *
2507  * \code
2508  * for (i=0; i<2; i++)
2509  * for (j=0; j<3; j++)
2510  * S0;
2511  * \endcode
2512  *
2513  * is {S0[0,0], S0[0,1], S0[0,2], S0[1,0], S0[1,1], S0[1,2]}
2514  *
2515  * S0[0,0] is the execution instance of S0 in the iteration [0,0].
2516  *
2517  * The previous set of integer tuples can be compactly described
2518  * by affine constraints as follows
2519  *
2520  * {S0[i,j]: 0<=i<2 and 0<=j<3}
2521  *
2522  * In general, the loop nest
2523  *
2524  * \code
2525  * for (i=0; i<N; i++)
2526  * for (j=0; j<M; j++)
2527  * S0;
2528  * \endcode
2529  *
2530  * has the following iteration domain
2531  *
2532  * {S0[i,j]: 0<=i<N and 0<=j<M}
2533  *
2534  * This should be read as: the set of points [i,j] such that
2535  * 0<=i<N and 0<=j<M.
2536  *
2537  * The name of the computation in the iteration domain should not
2538  * start with _ (an underscore). Names starting with _ are reserved
2539  * names.
2540  *
2541  * \p e is the expression computed by the computation.
2542  * It is possible to declare the computation without specifying the expression.
2543  * The expression can be specified later using computation::set_expression().
2544  * An example of setting the expression after declaring the computation
2545  * is presented in tests/test_04.cpp.
2546  *
2547  * \p schedule_this_computation should be set to true if the computation
2548  * is supposed to be scheduled and code is supposed to be generated from
2549  * the computation. Set it to false if you just want to use the
2550  * computation to represent a buffer (that is passed as an argument
2551  * to the function) and you do not intend to generate code for the
2552  * computation.
2553  * An example where this argument is set to false is presented in
2554  * tests/test_14.cpp.
2555  *
2556  * \p t is the type of the computation, i.e. the type of the expression
2557  * computed by the computation. Example of types include (p_uint8,
2558  * p_uint16, p_uint32, ...).
2559  *
2560  * \p fct is a pointer to the Tiramisu function where this computation
2561  * should be added.
2562  *
2563  * Bound Inference:
2564  * The user can declare computations without providing any constraint
2565  * about the iteration domain, in this case he can rely on bound inference
2566  * to infer the constraints about each iteration domain. The user needs only
2567  * to provide constraints over the domains of the last computations (last
2568  * consumers), and Tiramisu will propagate these constraints to all the
2569  * chain of computations that precede those consumers.
2570  * Note that bound inference is not possible if you have multiple definitions
2571  * of the same computation. In such a case, you should provide constraints
2572  * over the iteration domain when you declare the computation.
2573  *
2574  * Examples about bound inference are provided in test_22 to test_25.
2575  */
2576  computation(std::string iteration_domain, tiramisu::expr e,
2577  bool schedule_this_computation, tiramisu::primitive_t t,
2578  tiramisu::function *fct);
2579 
2580  /**
2581  * \brief Constructor for computations.
2582  *
2583  * \details
2584  *
2585  * Same as \ref tiramisu::computation::computation(std::string name, std::vector<var> iterator_variables, tiramisu::expr e)
2586  * except that it has the following additional argument:
2587  *
2588  * \p schedule_this_computation indicates whether this computation should
2589  * be scheduled.
2590  */
2591  computation(std::string name, std::vector<var> iterator_variables, tiramisu::expr e, bool schedule_this_computation);
2592 
2593  /**
2594  * \overload
2595  */
2596  computation(std::vector<var> iterator_variables, tiramisu::expr e);
2597 
2598 
2599  /**
2600  * \brief Constructor for computations.
2601  *
2602  * \details
2603  *
2604  * Declare a computation. A computation in Tiramisu is the equivalent of a
2605  * a statement surrounded by a loop nest in C (an example is provided later).
2606  *
2607  * \p name is the name of the computation.
2608  *
2609  * \p iterator_variables is a vector that represents the loop iterators
2610  * around the computation.
2611  *
2612  * \p e is the expression computed by the computation.
2613  *
2614  * For example, if we have two iterator variables
2615  *
2616  * \code
2617  * var i("i", 0, 20), j("j", 0, 30);
2618  * \endcode
2619  *
2620  * and we have the following computation declaration
2621  *
2622  * \code
2623  * computation S("S", {i,j}, 4);
2624  * \endcode
2625  *
2626  * This is equivalent to writing the following C code
2627  *
2628  * \code
2629  * for (i=0; i<20; i++)
2630  * for (j=0; j<30; j++)
2631  * S(i,j) = 4;
2632  * \endcode
2633  *
2634  * More precisely, the vector {i, j} specifies the iteration domain of
2635  * the computation S0. In this case, 0<=i<20 and 0<=j<30.
2636  *
2637  * It is possible to declare the computation without specifying the expression.
2638  * The expression can be specified later using computation::set_expression().
2639  * An example of setting the expression after declaring the computation
2640  * is presented in tutorial 04. Usually this is needed for writing
2641  * reductions.
2642  *
2643  */
2644  computation(std::string name, std::vector<var> iterator_variables, tiramisu::expr e);
2645 
2646  /**
2647  * \overload
2648  */
2649  computation(std::vector<var> iterator_variables, tiramisu::expr e, bool schedule_this_computation);
2650 
2651 
2652  /**
2653  * \overload
2654  *
2655  * \p t is the type of the computation, i.e. the type of the expression
2656  * computed by the computation. Example of types include (p_uint8,
2657  * p_uint16, p_uint32, ...).
2658  *
2659  * Usually, this constructor is used to declare buffer wrappers.
2660  *
2661  * For example, if we have two iterator variables
2662  *
2663  * \code
2664  * var i("i", 0, 20), j("j", 0, 30);
2665  * \endcode
2666  *
2667  * and we have the following computation declaration
2668  *
2669  * \code
2670  * computation S("S", {i,j}, p_uint8);
2671  * \endcode
2672  *
2673  * This can be used as a wrapper on a buffer buf[20, 30] where the buffer elements
2674  * are of type uint8.
2675  */
2676  computation(std::string name, std::vector<var> iterator_variables, primitive_t t):
2677  computation(name, iterator_variables, expr())
2678  {
      // The delegated constructor was handed an empty expr(), whose dtype is
      // undefined; override both the computation's data type and the stored
      // expression's dtype with the caller-supplied type \p t so that the
      // wrapper computation reports the intended element type.
2679  this->data_type = t;
2680  this->expression.dtype = t;
2681  }
2682 
2683  /**
2684  * \overload
2685  */
2686  computation(std::vector<var> iterator_variables, primitive_t t):
2687  computation(iterator_variables, expr())
2688  {
      // Same as the named overload above: the empty expr() passed to the
      // delegated constructor has an undefined dtype, so both the
      // computation's data type and the expression's dtype are forced to \p t.
2689  this->data_type = t;
2690  this->expression.dtype = t;
2691  }
2692 
2693  virtual bool is_send() const;
2694 
2695  virtual bool is_recv() const;
2696 
2697  virtual bool is_send_recv() const;
2698 
2699  virtual bool is_wait() const;
2700 
2701  /**
2702  * \brief Add a let statement that is associated to this computation.
2703  * \details The let statement will be executed before the computation
2704  * (more precisely, between this computation and any computation that
2705  * precedes it). The variable defined by the let statement can be
2706  * accessed by this computation alone, i.e., it cannot be used
2707  * in any other computation.
2708  */
2709  void add_associated_let_stmt(std::string access_name, tiramisu::expr e);
2710 
2711  /**
2712  * Don't schedule a previously scheduled computation
2713  */
2714  void unschedule_this_computation();
2715 
2716  /**
2717  * \brief Add definitions of computations that have the same name as this
2718  * computation.
2719  *
2720  * \details The arguments of this function are identical to the arguments of
2721  * the computation constructor. In general, this function is used to express
2722  * reductions and to express computation updates.
2723  *
2724  * In other words, this function should be used if the user has already declared
2725  * a set of computations C and wants to declare more computations that have the
2726  * same name.
2727  *
2728  * Example: Let's assume we want to declare the following two computations.
2729  *
2730  * \code
2731  * // First computation
2732  * {C[i]: 0<=i<10}: 0
2733  *
2734  * // Second computation
2735  * {C[i]: 10<=i<20}: 1
2736  * \endcode
2737  *
2738  * To do this, we can declare the first computation using the computation
2739  * constructor and declare the second computation using add_definitions().
2740  *
2741  * The use of add_definitions is purely due to restrictions imposed by the C++ language
2742  * and not by the Tiramisu framework itself. This is mainly because in C++, it is not
2743  * possible to declare two objects with the same name, for example one cannot do
2744  *
2745  * computation C(...);
2746  * computation C(...);
2747  *
2748  * In order to declare the second set of computations, we chose to use the add_definitions
2749  * function to avoid this problem.
2750  *
2751  * The newly added computations must have the same name and the same access function
2752  * as the initial set of computations but can have a different expression.
2753  *
2754  * An example of using this function is available in test_26.
2755  *
2756  */
2757  virtual void add_definitions(std::string iteration_domain_str, tiramisu::expr e,
2758  bool schedule_this_computation, tiramisu::primitive_t t,
2759  tiramisu::function *fct);
2760 
2761  /**
2762  * \brief Add a predicate (condition) on the computation. The computation will be executed
2763  * only if this condition is true.
2764  *
2765  * \details The predicate can be an affine or a non-affine expression.
2766  * If you need to put a condition around a block of statements (i.e., a sequence of computations),
2767  * then you can perform that by adding a predicate to each one of those computations.
2768  * The compiler will then transform automatically the multiple conditions (condition around each
2769  * computation) into one condition around the whole block.
2770  */
2771  void add_predicate(tiramisu::expr predicate);
2772 
2773  /**
2774  * \brief Schedule this computation to run after the computation \p comp.
2775  *
2776  * \details This computation is placed after \p comp in the loop level \p level.
2777  * \p level is a loop level in this computation.
2778  *
2779  * The root level is computation::root. The global variable
2780  * computation::root is equivalent to var("root").
2781  *
2782  * For example assuming we have the two computations
2783  *
2784  * {S0[i,j]: 0<=i<N and 0<=j<N} and {S1[i,j]: 0<=i<N and 0<=j<N}
2785  *
2786  * In order to make S1 run after S0 in the i loop, one should use
2787  *
2788  * S1.after(S0, i)
2789  *
2790  * which means: S1 is after S0 at the loop level i (which is loop level 0).
2791  *
2792  * The corresponding code is
2793  *
2794  * \code
2795  * for (i=0; i<N; i++)
2796  * {
2797  * for (j=0; j<N; j++)
2798  * S0;
2799  * for (j=0; j<N; j++)
2800  * S1;
2801  * }
2802  * \endcode
2803  *
2804  * S1.after(S0, j)
2805  *
2806  * means: S1 is after S0 at the loop level j (which is 1) and would yield
2807  * the following code
2808  *
2809  * \code
2810  * for (i=0; i<N; i++)
2811  * for (j=0; j<N; j++)
2812  * {
2813  * S0;
2814  * S1;
2815  * }
2816  * \endcode
2817  *
2818  * S1.after(S0, computation::root)
2819  * means S1 is after S0 at the main program level and would yield
2820  * the following code
2821  *
2822  * \code
2823  * for (i=0; i<N; i++)
2824  * for (j=0; j<N; j++)
2825  * S0;
2826  * for (i=0; i<N; i++)
2827  * for (j=0; j<N; j++)
2828  * S1;
2829  * \endcode
2830  *
2831  * Note that as with all other scheduling methods:
2832  * - Calling this method with the same computations overwrites the level if it is
2833  * higher.
2834  * - A computation being scheduled after another computation at level L means it is
2835  * scheduled after that computation at all levels lower than L.
2836  * - There should be exactly one computation with no computation scheduled before it.
2837  * - Each other computation should have exactly one computation scheduled before it.
2838  *
2839  */
2840  void after(computation &comp, tiramisu::var iterator);
2841 
2842  /**
2843  * This function is equivalent to
2844  * void after(computation &comp, tiramisu::var iterator);
2845  * except that it uses loop level numbers (0, 1, 2, ...) instead of using loop variables
2846  * (tiramisu::var). Tiramisu internally represent loop levels using numbers instead
2847  * of variable names, and this is the actual function used internally.
2848  *
2849  * The outermost loop level is 0. The root level is computation::root_dimension.
2850  *
2851  * For example assuming we have the two computations
2852  *
2853  * {S0[i,j]: 0<=i<N and 0<=j<N} and {S1[i,j]: 0<=i<N and 0<=j<N}
2854  *
2855  * In order to make S1 run after S0 in the i loop, one should use
2856  *
2857  * S1.after(S0,0)
2858  *
2859  * which means: S1 is after S0 at the loop level 0 (which is i).
2860  *
2861  * The corresponding code is
2862  *
2863  * \code
2864  * for (i=0; i<N; i++)
2865  * {
2866  * for (j=0; j<N; j++)
2867  * S0;
2868  * for (j=0; j<N; j++)
2869  * S1;
2870  * }
2871  * \endcode
2872  */
2873  void after(computation &comp, int level);
2874 
2875  /**
2876  * Schedule this computation to run after the computation \p comp.
2877  * The computations are placed after each other in the loop level \p level.
2878  * The outermost loop level is 0. The root level is computation::root_dimension.
2879  *
2880  * For example assuming we have the two computations
2881  *
2882  * {S0[i,j]: 0<=i<N and 0<=j<N} and {S1[i,j]: 0<=i<N and 0<=j<N}
2883  *
2884  * In order to make S1 run after S0 in the i loop, one should use
2885  *
2886  * S1.after_low_level(S0,0)
2887  *
2888  * which means: S1 is after S0 at the loop level 0 (which is i).
2889  *
2890  * The corresponding code is
2891  *
2892  * \code
2893  * for (i=0; i<N; i++)
2894  * {
2895  * for (j=0; j<N; j++)
2896  * S0;
2897  * for (j=0; j<N; j++)
2898  * S1;
2899  * }
2900  * \endcode
2901  *
2902  * S1.after_low_level(S0,1)
2903  *
2904  * means: S1 is after S0 at the loop level 1 (which is j) and would yield
2905  * the following code
2906  *
2907  * \code
2908  * for (i=0; i<N; i++)
2909  * for (j=0; j<N; j++)
2910  * {
2911  * S0;
2912  * S1;
2913  * }
2914  * \endcode
2915  *
2916  * S1.after_low_level(S0, computation::root_dimension)
2917  * means S1 is after S0 at the main program level and would yield
2918  * the following code
2919  *
2920  * \code
2921  * for (i=0; i<N; i++)
2922  * for (j=0; j<N; j++)
2923  * S0;
2924  * for (i=0; i<N; i++)
2925  * for (j=0; j<N; j++)
2926  * S1;
2927  * \endcode
2928  *
2929  * To specify that this computation is after \p comp in multiple levels,
2930  * the user can provide those levels in the \p levels vector.
2931  *
2932  * S1.after_low_level(S0, {0,1})
2933  *
2934  * means that S1 is after S0 in the loop level 0 and in the loop level 1.
2935  *
2936  * Note that
2937  *
2938  * S1.after_low_level(S0, L)
2939  *
2940  * would mean that S1 and S0 share the same loop nests for all the loop
2941  * levels that are before L and that S1 is after S0 in L only. S1 is not
2942  * after S0 in the loop levels that are before L.
2943  *
2944  */
2945  // @{
2946  void after_low_level(computation &comp, int level);
2947  void after_low_level(computation &comp, std::vector<int> levels);
2948  // @}
2949  /**
2950  * \brief Allocate a buffer for the computation automatically. The size of the buffer
2951  * is deduced automatically and a name is assigned to it automatically.
2952  *
2953  * \details Assuming the name of the computation is C, the name of the generated buffer
2954  * is _C_buffer.
2955  *
2956  * \p type is the type of the argument. Three possible types exist:
2957  * - a_input: for inputs of the function,
2958  * - a_output: for outputs of the function,
2959  * - a_temporary: for buffers used as temporary buffers within
2960  * the function (any temporary buffer is allocated automatically by
2961  * the Tiramisu runtime at the entry of the function and is
2962  * deallocated at the exit of the function).
2963  */
2964  void allocate_and_map_buffer_automatically(tiramisu::argument_t type = tiramisu::a_temporary);
2965 
2966  /**
2967  * \brief Apply a transformation on the schedule.
2968  *
2969  * \details This transformation is from
2970  * the time-space domain to the time-space domain. It is applied on
2971  * the range of the schedule (i.e., on the output of the schedule relation).
2972  *
2973  * For example, to shift the i dimension of the time-processor domain
2974  * of C0, you can apply the transformation
2975  *
2976  * C0[0, 0, i, 0, j, 0] -> C0[0, 0, i+2, 0, j, 0]
2977  *
2978  * To apply an interchange, you would do
2979  *
2980  * C0[0, 0, i, 0, j, 0] -> C0[0, 0, j, 0, i, 0]
2981  */
2982  void apply_transformation_on_schedule(std::string map_str);
2983 
2984  /**
2985  * \brief Schedule this computation to run before the computation \p consumer
2986  * at the loop level \p L.
2987  *
2988  * \details Notes
2989  * - The loop level \p L is a loop level of this computation.
2990  * - Use computation::root to indicate the root dimension
2991  * (i.e. the outermost time-space dimension).
2992  * - Calling this method with the same computations overwrites the level if it is
2993  * higher.
2994  * - A computation being scheduled after another computation at level L means it is
2995  * scheduled after that computation at all levels lower than L.
2996  * - There should be exactly one computation with no computation scheduled before it.
2997  * - Each other computation should have exactly one computation scheduled before it.
2998  */
2999  // @{
3000  void before(computation &consumer, tiramisu::var L);
3001  // @}
3002 
3003  /**
3004  * Schedule this computation to run after \p before_comp at the loop level \p before_l,
3005  * and before \p after_comp at loop level \p after_l. The outermost loop level is 0.
3006  *
3007  * Use computation::root_dimension to indicate the root dimension
3008  * (i.e. the outermost time-space dimension).
3009  *
3010  * If there was already a direct scheduling between \p before_comp and \p after_comp
3011  * (e.g. using before, after, between...), that schedule is overwritten; i.e. it no longer
3012  * exists/has an effect.
3013  *
3014  * Note that as with all other scheduling methods:
3015  * - Calling this method with the same computations overwrites the levels if they are
3016  * higher.
3017  * - A computation being scheduled after another computation at level L means it is
3018  * scheduled after that computation at all levels lower than L.
3019  * - There should be exactly one computation with no computation scheduled before it.
3020  * - Each other computation should have exactly one computation scheduled before it.
3021  */
3022  void between(computation &before_comp, tiramisu::var before_l, computation &after_comp, tiramisu::var after_l);
3023 
3024  /**
3025  * \brief Store this computation in \p buff
3026  *
3027  * \details
3028  *
3029  * Let us assume that we have a computation C:
3030  *
3031  * \code
3032  * {C[i]: 0<=i<N}
3033  * \endcode
3034  *
3035  * and that we want to store each C(i) in bufC[i]. Then we
3036  * can use store_in() to indicate that as follows:
3037  *
3038  * \code
3039  * C.store_in(&bufC)
3040  * \endcode
3041  *
3042  * This means that each computation C(i) will be stored
3043  * in the buffer location bufC[i].
3044  *
3045  * If \p iterators is specified, the \p iterators are used to specify how the
3046  * computation is mapped to the buffer.
3047  * If the dimensions of this computation are in0, in1, ..., inn and if
3048  * \p iterators are equal to im0, im1, ..., imm then the computation is
3049  * mapped as follows
3050  *
3051  * \code
3052  * C[in0, in1, ..., inn]->bufC[im0, im1, ..., imm].
3053  * \endcode
3054  *
3055  * i.e., the computation C[in0, in1, ..., inn] is stored in bufC[im0,
3056  * im1, ..., imm].
3057  *
3058  * This can be used to store the data in many ways (reordering the
3059  * storage, storing into modulo buffers, ...).
3060  *
3061  * Assuming we have a computation D(i,j) that has the following
3062  * iteration domain:
3063  *
3064  * \code
3065  * {D[i,j]: 0<=i<N and 0<=j<N}
3066  * \endcode
3067  *
3068  * and assuming we have a buffer bufD.
3069  *
3070  * The store_in() function can be used to implement many types of data mappings:
3071  * - Store the computation D to a scalar: D.store_in(&bufD, {}).
3072  * This means that D(i) will be stored in bufD[0] (which represents a
3073  * scalar).
3074  * - Store a 2 dimensional computation into a 1-dimensional
3075  * buffer: D.store_in(&bufD, {i});
3076  * - Change the order of storage.
3077  * D.store_in(&bufD, {j, i}) will store D(i,j) in bufD(j,i).
3078  * - Store the computation in a circular buffer (modulo storage).
3079  * D.store_in(&bufD, {i%4, j%4});
3080  * This will store D(i,j) in bufD[i%4, j%4]. Assuming the buffer
3081  * bufD is a 4x4 buffer.
3082  */
3083  // @{
3084  void store_in(buffer *buff);
3085  void store_in(buffer *buff, std::vector<expr> iterators);
3086  // @}
3087 
3088  /**
3089  * This function assumes that \p consumer consumes values produced by
3090  * this computation (which is the producer).
3091  *
3092  * Compute this computation as needed for each unique value of the
3093  * \p consumer.
3094  *
3095  * This computation is scheduled so that the values consumed by the
3096  * \p consumer are computed at the level \p L and in the same loop
3097  * nest of the consumer. \p L is a loop level in the consumer.
3098  *
3099  * If the consumer needs this computation to be computed redundantly,
3100  * the function creates the necessary redundant computations and schedules
3101  * them before the consumer.
3102  *
3103  * This function performs the following:
3104  * - schedules this computation to be executed as needed before
3105  * the consumer.
3106  * - if this computation needs to be computed redundantly, redundant
3107  * computations are created.
3108  *
3109  * This function does not:
3110  * - create any data mapping to this computation. It is up to the
3111  * user to provide an access relation to this computation as he
3112  * would do to any other normal computation.
3113  * - it does not allocate any buffer to this computation. It is
3114  * up to the user to declare a buffer where the results of this
3115  * computation will be stored.
3116  *
3117  * If this functions creates a duplicate of the computation, the user
3118  * does not need to set its access relation. The duplicated computation
3119  * will automatically have the same access relation as the original
3120  * computation. This access relation is set automatically.
3121  *
3122  * This function does not return a handler to manipulate the duplicate
3123  * computation. It does not allow the user to manipulate the duplicate
3124  * freely. The duplicate is scheduled automatically to be executed
3125  * before the consumer.
3126  */
3127  void compute_at(computation &consumer, tiramisu::var L);
3128  void compute_at(computation &consumer, int L);
3129 
3130  /**
3131  * Generates the time-space domain and construct an AST that scans that
3132  * time-space domain, then compute the depth of this AST.
3133  * This is useful for example to know if all the dimensions of the time-space
3134  * domain will correspond to a loop level in the final generated AST.
3135  */
3136  int compute_maximal_AST_depth();
3137 
3138  /**
3139  * Dump the iteration domain of the computation. This is useful for
3140  * debugging.
3141  */
3142  void dump_iteration_domain() const;
3143 
3144  /**
3145  * Dump the schedule of the computation. This is mainly useful for
3146  * debugging.
3147  *
3148  * The schedule is a relation between the iteration space and the
3149  * time space. The relation provides a logical date of execution for
3150  * each point in the iteration space.
3151  * The schedule needs first to be set before calling this function.
3152  */
3153  void dump_schedule() const;
3154 
3155  /**
3156  * Dump the computation on stdout. This is mainly useful for
3157  * debugging.
3158  */
3159  void dump() const;
3160 
3161  /**
3162  * Fuse this computation with the computation passed as argument
3163  * in the same loop. Run this computation after that computation.
3164  * Fuse them at the loop level \p lev.
3165  *
3166  * For example, assuming we have the following computations
3167  *
3168  * {S0[i,j]: 0<=i<N and 0<=j<N}, {S1[i,j]: 0<=i<N and 0<=j<N}
3169  * and {S2[i,j]: 0<=i<N and 0<=j<N}.
3170  *
3171  * Without fusion, these computations would be equivalent
3172  * to the following loops nests
3173  *
3174  * \code
3175  * for (i=0; i<N; i++)
3176  * for (j=0; j<N; j++)
3177  * S0;
3178  *
3179  * for (i=0; i<N; i++)
3180  * for (j=0; j<N; j++)
3181  * S1;
3182  *
3183  * for (i=0; i<N; i++)
3184  * for (j=0; j<N; j++)
3185  * S2;
3186  * \endcode
3187  *
3188  * To fuse them, one should call
3189  *
3190  * \code
3191  * S2.fuse_after(j, S1);
3192  * S1.fuse_after(j, S0);
3193  * \endcode
3194  *
3195  * This would result in fusing S2 with S0 and S1 at loop level j.
3196  * S2 will be scheduled for execution after S0 and S1. The resulting code would look like
3197  *
3198  * \code
3199  * for (i=0; i<N; i++)
3200  * for (j=0; j<N; j++)
3201  * {
3202  * S0;
3203  * S1;
3204  * S2;
3205  * }
3206  * \endcode
3207  *
3208  * Calling
3209  *
3210  * \code
3211  * S2.fuse_after(i, S1);
3212  * S1.fuse_after(i, S0);
3213  * \endcode
3214  *
3215  * would result in the following code
3216  *
3217  * \code
3218  * for (i=0; i<N; i++)
3219  * {
3220  * for (j=0; j<N; j++)
3221  * S0;
3222  * for (j=0; j<N; j++)
3223  * S1;
3224  * for (j=0; j<N; j++)
3225  * S2;
3226  * }
3227  * \endcode
3228  *
3229  */
3231  {
3232  assert(lev.get_name().size() > 0);
3233 
3234  this->after(comp, lev);
3235  }
3236 
3237  /**
3238  * Generate the time-space domain of the computation.
3239  *
3240  * In this representation, the logical time of execution and the
3241  * processor where the computation will be executed are both
3242  * specified. The memory location where computations will be
3243  * stored in memory is not specified at this level.
3244  */
3245  void gen_time_space_domain();
3246 
3247  /**
3248  * Specify that the rank loop iterator should be removed from linearization.
3249  */
3250  void drop_rank_iter(var level);
3251 
3252  /**
3253  * Get the data type of the computation.
3254  */
3255  tiramisu::primitive_t get_data_type() const;
3256 
3257  /**
3258  * Return the Tiramisu expression associated with the computation.
3259  */
3260  const tiramisu::expr &get_expr() const;
3261 
3262  /**
3263  * Return the iteration domain of the computation.
3264  * In this representation, the order of execution of computations
3265  * is not specified, the computations are also not mapped to memory.
3266  */
3267  isl_set *get_iteration_domain() const;
3268 
3269  /**
3270  * Get the last update of a computation.
3271  */
3272  tiramisu::computation& get_last_update();
3273 
3274  /**
3275  * Search the time-space domain (the range of the schedule) and
3276  * return the loop level number that correspond to the dimension
3277  * named \p dim.
3278  * In other words, translate the vector of dimension name (\p dim_name)
3279  * into a loop level number.
3280  */
3282  {
3283  return this->get_loop_level_numbers_from_dimension_names({dim_name})[0];
3284  }
3285 
3286  /**
3287  * Return the name of the computation.
3288  */
3289  const std::string &get_name() const;
3290 
3291  /**
3292  * Returns a pointer to the computation scheduled immediately before this computation,
3293  * or a null pointer if none exists.
3294  */
3295  computation * get_predecessor();
3296 
3297  /**
3298  * Returns the \p index update that has been added to this computation such that:
3299  * - If \p index == 0, then this computation is returned.
3300  * - If \p index > 0, then it returns the \p index -th computation added through add_definitions.
3301  */
3302  tiramisu::computation& get_update(int index);
3303 
3304  /**
3305  * Get the schedule of the computation.
3306  */
3307  isl_map *get_schedule() const;
3308 
3309  /**
3310  * Tile the computation and then tag the outermost tile dimension
3311  * to be mapped to GPU blocks and tag the innermost tile dimensions
3312  * to be mapped to GPU threads.
3313  *
3314  * Tile the two loop levels \p L0 and \p L1 with rectangular
3315  * tiling. \p sizeX and \p sizeY represent the tile size.
3316  * \p L0 and \p L1 should be two consecutive loop levels
3317  * (i.e., \p L0 = \p L1 + 1) and they should satisfy
3318  * \p L0 > \p L1.
3319  *
3320  * \p L0_outer, \p L1_outer, \p L0_inner, \p L1_inner
3321  * are the names of the new dimensions created after tiling.
3322  */
3323  // @{
3324  void gpu_tile(tiramisu::var L0, tiramisu::var L1, int sizeX, int sizeY);
3325  void gpu_tile(tiramisu::var L0, tiramisu::var L1, int sizeX, int sizeY,
3326  tiramisu::var L0_outer, tiramisu::var L1_outer,
3327  tiramisu::var L0_inner, tiramisu::var L1_inner);
3328  void gpu_tile(tiramisu::var L0, tiramisu::var L1, tiramisu::var L2, int sizeX, int sizeY, int sizeZ);
3329  void gpu_tile(tiramisu::var L0, tiramisu::var L1, tiramisu::var L2, int sizeX, int sizeY, int sizeZ,
3330  tiramisu::var L0_outer, tiramisu::var L1_outer, tiramisu::var L2_outer,
3331  tiramisu::var L0_inner, tiramisu::var L1_inner, tiramisu::var L2_inner);
3332  // @}
3333 
3334  /**
3335  * Return the buffer that was allocated automatically using
3336  * high level data mapping functions.
3337  * If no automatic buffer was allocated, this function returns NULL.
3338  */
3339  tiramisu::buffer *get_automatically_allocated_buffer();
3340 
3341  /**
3342  * Interchange (swap) the two loop levels \p L0 and \p L1.
3343  */
3344  void interchange(tiramisu::var L0, tiramisu::var L1);
3345 
3346  /**
3347  * Identical to
3348  * void interchange(tiramisu::var L0, tiramisu::var L1);
3349  */
3350  void interchange(int L0, int L1);
3351 
3352  /**
3353  * Mark this statement as a let statement.
3354  */
3355  void mark_as_let_statement();
3356 
3357  /**
3358  * Mark this statement as a library call.
3359  */
3360  void mark_as_library_call();
3361 
3362  /**
3363  * Tag the loop level \p L to be parallelized.
3364  *
3365  * This function is equivalent to the function \ref tiramisu::computation::tag_parallel_level() .
3366  * There is no difference between the two.
3367  *
3368  */
3369  void parallelize(tiramisu::var L);
3370 
3371  /**
3372  * Set the access relation of the computation.
3373  *
3374  * The access relation is a relation from computations to buffer locations.
3375  * \p access_str is a string that represents the relation. It is encoded
3376  * in the ISL format, http://isl.gforge.inria.fr/user.html#Sets-and-Relations
3377  * of relations.
3378  *
3379  * Note that, in Tiramisu, the access relations of computations that have the same name
3380  * must be identical.
3381  *
3382  * Examples: tutorial_01, tutorial_02, tutorial_08 (actually most tutorials have set_access()).
3383  */
3384  // @{
3385  void set_access(std::string access_str);
3386  void set_access(isl_map *access);
3387  // @}
3388 
3389  void set_wait_access(std::string access_str);
3390  void set_wait_access(isl_map *access);
3391 
3392  /**
3393  * Set the expression of the computation.
3394  */
3395  void set_expression(const tiramisu::expr &e);
3396 
3397  /**
3398  * Sets whether the computation is inline or not, based on the value of \p is_inline.
3399  * If a computation is inline, accesses to the computation return the expression of that
3400  * computation.
3401  * E.g. if an inline computation S(i,j) is defined with the expression i + j,
3402  * then S(i + 1, j * i) returns the expression i + 1 + j * i.
3403  * If \p is_inline is not provided, the computation is set to be inline.
3404  */
3405  void set_inline(bool is_inline = true);
3406 
3407  /**
3408  * Returns true if and only if the computation is inline.
3409  */
3410  const bool is_inline_computation() const;
3411 
3412  /**
3413  * Set the schedule indicated by \p map.
3414  *
3415  * \p map is a string that represents a mapping from the iteration domain
3416  * to the time-space domain (the ISL format to represent maps is
3417  * documented in http://barvinok.gforge.inria.fr/barvinok.pdf in Sec 1.2.2).
3418  *
3419  * The schedule is a map from the iteration domain to a time space
3420  * domain. The same name of space should be used for both the range
3421  * and the domain of the schedule.
3422  *
3423  * In general, users can set the schedule using high level functions such
3424  * as before(), after(), tile(), compute_at(), vectorize(), split(), ...
3425  * The use of this function is only reserved for advanced users who want
3426  * a low level control of the schedule.
3427  *
3428  * Vectors in the time-space domain have the following form
3429  *
3430  * computation_name[redundancy_ID,static,dynamic,static,dynamic,static,dynamic,static,...]
3431  *
3432  * The first dimension of the vector is used to indicate the redundancy ID
3433  * (the notion of the redundancy ID is explained later).
3434  *
3435  * The following dimensions are interleaved dimensions: static, dynamic, static,
3436  * dynamic, ...
3437  * Dynamic dimensions represent the loop levels, while static dimensions are
3438  * used to order statements within a given block of statements in a given loop
3439  * level.
3440  * For example, the computations c0 and c1 in the following loop nest
3441  *
3442  * \code
3443  * for (i=0; i<N: i++)
3444  * for (j=0; j<N; j++)
3445  * {
3446  * c0;
3447  * c1;
3448  * }
3449  * \endcode
3450  *
3451  * have the following representations in the iteration domain
3452  *
3453  * \code
3454  * {c0[i,j]: 0<=i<N and 0<=j<N}
3455  * {c1[i,j]: 0<=i<N and 0<=j<N}
3456  * \endcode
3457  *
3458  * and the following representation in the time-space domain
3459  *
3460  * \code
3461  * {c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
3462  * {c1[0,0,i,0,j,1]: 0<=i<N and 0<=j<N}
3463  * \endcode
3464  *
3465  * The first dimension (dimension 0) in the time-space
3466  * domain (the leftmost dimension) is the redundancy ID
3467  * (in this case it is 0, the meaning of this ID will be explained later).
3468  * The second dimension (starting from the left) is a static dimension,
3469  * the third dimension is a dynamic dimension that
3470  * represents the loop level i, ..., the fifth dimension is a dynamic
3471  * dimension that represents the loop level j and the last dimension
3472  * (dimension 5) is a static dimension and allows the ordering of
3473  * c1 after c0 in the loop nest.
3474  *
3475  * To transform the previous iteration domain to the
3476  * time-space domain, the following schedule should be used:
3477  *
3478  * \code
3479  * {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
3480  * {c1[i,j]->c1[0,0,i,0,j,1]: 0<=i<N and 0<=j<N}
3481  * \endcode
3482  *
3483  * The first dimension called "redundancy ID" is only meaningful if the
3484  * computation is redundant. i.e., some parts of the computation are
3485  * redundantly computed. Redundant computations are in general used to
3486  * maximize parallelism and data locality on the expense of doing some
3487  * computations redundantly.
3488  * For example, if the two computations c1(i,j) and c2(i,j) both depend
3489  * on the computation c0(i,j), instead of waiting for c0(i,j) to be
3490  * computed and then computing c1(i,j) and c2(i,j) in parallel, the thread
3491  * executing c1(i,j) can compute c0(i,j) by itself and then run c1(i,j).
3492  * The thread that computes c2(i,j) can do the same and compute c0(i,j)
3493  * by itself and then compute c2(i,j). In this case the two threads do not
3494  * need to wait. This is done at the expense of redundant computation since
3495  * c0(i,j) is computed by both threads.
3496  *
3497  * In general redundant computations are useful when tiling stencil
3498  * computations. In the context of stencils such a tiling is called
3499  * "overlapped tiling". Tiles that depend on results computed by other
3500  * tiles that run in parallel can compute the boundaries redundantly which
3501  * allows them to avoid waiting and thus can run in parallel.
3502  *
3503  * In Tiramisu, the user can indicate that a chunk of a computation
3504  * should be computed redundantly. The original computation always has a redundancy
3505  * ID equal to 0 (which means this is the original computation).
3506  * The redundant chunk has an ID that is different from 0 and that is
3507  * used to uniquely identify it.
3508  *
3509  * For example if we want to compute all of c0 three times (that is,
3510  * compute the original computation and compute two redundant computations),
3511  * we can use the following schedules:
3512  *
3513  * The schedule of the original computation: {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N}
3514  * The schedule of the redundant computation N°1: {c0[i,j]->c0[1,0,i,0,j,0]: 0<=i<N and 0<=j<N}
3515  * The schedule of the redundant computation N°2: {c0[i,j]->c0[2,0,i,0,j,0]: 0<=i<N and 0<=j<N}
3516  *
3517  * The schedule of c0 in this case would be three maps that map c0[i,j] to
3518  * the three different redundant computations in the time-processor domain:
3519  *
3520  * \code
3521  * {c0[i,j]->c0[0,0,i,0,j,0]: 0<=i<N and 0<=j<N;
3522  * c0[i,j]->c0[1,0,i,0,j,0]: 0<=i<N and 0<=j<N;
3523  * c0[i,j]->c0[2,0,i,0,j,0]: 0<=i<N and 0<=j<N}
3524  * \endcode
3525  *
3526  * The function set_schedule() overrides any other schedule set by the high level
3527  * scheduling functions. Currently the user has to choose between using the high
3528  * level scheduling functions or using this low level set_schedule function. The user
3529  * cannot mix the use of the two in the same program because they are not compatible.
3530  */
3531  // @{
3532  void set_low_level_schedule(isl_map *map);
3533  void set_low_level_schedule(std::string map_str);
3534  // @}
3535 
3536  /**
3537  * Shift the loop level \p L0 of the iteration space by
3538  * \p n iterations.
3539  *
3540  * \p n can be a positive or a negative number. A positive
3541  * number means a shift forward of the loop iterations while
3542  * a negative value would mean a shift backward.
3543  */
3544  void shift(tiramisu::var L0, int n);
3545 
3546  /**
3547  * Apply loop skewing on the loop levels \p i and \p j with a skewing factor of \p f.
3548  * The names of the new loop levels is \p ni and \p nj.
3549  *
3550  * This command transforms the loop (i, j) into the loop (i, f*i+j).
3551  * For example if you have the following loop
3552  *
3553  * \code
3554  for (int i = 1; i < N; i++)
3555  for (int j = 1; j < M; j++)
3556  a[i][j] = a[i - 1][j] + a[i][j - 1];
3557  \endcode
3558 
3559  * and apply
3560 
3561  \code
3562  a.skew(i, j, 1, ni, nj);
3563  \endcode
3564 
3565  * you would get
3566  *
3567  \code
3568  for (int i = 1; i < N; i++)
3569  for (int j = i+1; j < i+M; j++)
3570  a[i][j - i] = a[i - 1][j - i] + a[i][j - i - 1];
3571  \endcode
3572 
3573  */
3574  void skew(tiramisu::var i, tiramisu::var j, int f, tiramisu::var ni, tiramisu::var nj);
3575 
3576  /**
3577  * Apply loop skewing on the loop levels \p i, \p j and \p k with a skewing factor of \p f.
3578  * The names of the new loop levels is \p ni, \p nj and \p nk.
3579  *
3580  * This command transforms the loop (i, j, k) into the loop (i, f*i+j, f*i+k).
3581  */
3582  void skew(tiramisu::var i, tiramisu::var j, tiramisu::var k, int factor,
3584 
3585  /**
3586  * Apply loop skewing on the loop levels \p i, \p j, \p k, \p l with a skewing factor of \p f.
3587  * The names of the new loop levels is \p ni, \p nj, \p nk and \p nl.
3588  *
3589  * This command transforms the loop (i, j, k, l) into the loop (i, f*i+j, f*i+k, f*i+l).
3590  */
3591  void skew(tiramisu::var i, tiramisu::var j, tiramisu::var k, tiramisu::var l, int factor,
3593 
3594  /**
3595  * \overload
3596  */
3597  void skew(tiramisu::var i, tiramisu::var j, int factor);
3598 
3599  /**
3600  * \overload
3601  */
3602  void skew(tiramisu::var i, tiramisu::var j, tiramisu::var k, int factor);
3603 
3604  /**
3605  * \overload
3606  */
3607  void skew(tiramisu::var i, tiramisu::var j, tiramisu::var k, tiramisu::var l, int factor);
3608 
3609  /**
3610  * \overload
3611  */
3612  void skew(int i, int j, int factor);
3613 
3614  /**
3615  * \overload
3616  */
3617  void skew(int i, int j, int k, int factor);
3618 
3619  /**
3620  * \overload
3621  */
3622  void skew(int i, int j, int k, int l, int factor);
3623 
3624  /**
3625  * Split the loop level \p L0 of the iteration space into two
3626  * new loop levels.
3627  *
3628  * \p sizeX is the extent (size) of the inner loop created after
3629  * splitting.
3630  */
3631  //@{
3632  void split(tiramisu::var L0, int sizeX);
3633  void split(tiramisu::var L0, int sizeX,
3634  tiramisu::var L0_outer, tiramisu::var L0_inner);
3635  //@}
3636 
3637  /**
3638  * Identical to
3639  * void split(tiramisu::var L0, int sizeX);
3640  */
3641  void split(int L0, int sizeX);
3642 
3643  /**
3644  * Fold the storage of the computation.
3645  * Fold the loop level \p dim by a factor \p f.
3646  */
3647  void storage_fold(tiramisu::var dim, int f);
3648 
3649  /**
3650  * Allocate the storage of this computation in the loop level \p L0.
3651  *
3652  * This function does the following:
3653  * - computes the size of the buffer needed to store this computation
 3654  * (TODO: currently the size computed by Tiramisu is equal to the size
3655  * of the computation, Tiramisu does not allocate smaller buffers if
3656  * such a thing is possible, this is left for future work).
3657  * - allocates a temporary buffer with the appropriate size,
3658  * - schedules the allocation operation to be executed in the loop
 3659  * nest where \p comp is computed at the loop level \p L0.
3660  *
3661  * The function returns the computation (operation) that allocates
3662  * the buffer. The allocated buffer is not returned.
3663  */
3665  tiramisu::var L0);
3666 
3667  /**
3668  * Tag the loop level \p L0 and \p L1 to be mapped to GPU.
3669  */
3670  // @{
3671  void tag_gpu_level(tiramisu::var L0, tiramisu::var L1);
3672  void tag_gpu_level(tiramisu::var L0, tiramisu::var L1, tiramisu::var L2, tiramisu::var L3);
3673  void tag_gpu_level(tiramisu::var L0, tiramisu::var L1, tiramisu::var L2, tiramisu::var L3, tiramisu::var L4, tiramisu::var L5);
3674  // @}
3675 
3676  /**
3677  * Tag the loop level \p L to be parallelized.
3678  */
3679  void tag_parallel_level(tiramisu::var L);
3680 
3681  /**
3682  * Identical to
3683  * void tag_parallel_level(int L);
3684  */
3685  void tag_parallel_level(int L);
3686 
3687  /**
3688  * Tag the loop level \p L to be vectorized.
3689  * \p len is the vector length.
3690  *
3691  * The user can only tag loop levels that have constant extent.
3692  * If a loop level does not have a constant extent, the user
3693  * should call .vectorize() command instead or he can call
3694  * separate() and split() manually.
3695  *
3696  * The user has to make sure that the extent of the dimension
3697  * is bigger than \p len. The vectorization of a loop that has
3698  * less than \p len iterations is not correct.
3699  *
3700  */
3701  void tag_vector_level(tiramisu::var L, int len);
3702 
3703  /**
3704  * Identical to
3705  * void tag_vector_level(tiramisu::var L, int len);
3706  */
3707  void tag_vector_level(int L, int len);
3708 
3709  /**
3710  * Tag the loop level \p L to be distributed.
3711  */
3712  // @{
3713  void tag_distribute_level(tiramisu::var L);
3714  void tag_distribute_level(int L);
3715  // @}
3716 
3717  /**
3718  * Tag the loop level \p L to be unrolled.
3719  *
3720  * The user can only tag loop levels that have constant extent.
3721  */
3722  void tag_unroll_level(tiramisu::var L);
3723 
3724  /**
3725  * Identical to
3726  * void tag_unroll_level(tiramisu::var L);
3727  */
3728  void tag_unroll_level(int L);
3729 
3730  /**
3731  * \brief Schedule this computation to run before the computation \p next_computation
3732  * at the loop level \p L and return \p next_computation.
3733  *
3734  * \details Notes
3735  * - This method is a simple wrapper around computation::after to help schedule
3736  * chaining as in:
3737  * \code
3738  * C1.then(C2, j)
3739  * .then(C3, computation::root)
3740  * .then(C4, i)
3741  * .then(C5, j);
3742  * \endcode
3743  * - The loop level \p L is a loop level of this computation.
3744  * - Use computation::root to indicate the root dimension
3745  * (i.e. the outermost time-space dimension).
3746  * - Calling this method with the same computations overwrites the level if it is
3747  * higher.
3748  * - A computation being scheduled after another computation at level L means it is
3749  * scheduled after that computation at all levels lower than L.
3750  */
3751  // @{
3752  computation &then(computation &next_computation, tiramisu::var L);
3753  // @}
3754 
3755  /**
3756  * Tile the two loop levels \p L0 and \p L1 with rectangular
3757  * tiling. \p sizeX and \p sizeY represent the tile size.
3758  * \p L0 and \p L1 should be two consecutive loop levels.
3759  * \p L0_outer, \p L1_outer, \p L0_inner, \p L1_inner
3760  * are the names of the new dimensions created after tiling.
3761  */
3762  // @{
3763  void tile(tiramisu::var L0, tiramisu::var L1,
3764  int sizeX, int sizeY);
3765  void tile(tiramisu::var L0, tiramisu::var L1,
3766  int sizeX, int sizeY,
3767  tiramisu::var L0_outer, tiramisu::var L1_outer,
3768  tiramisu::var L0_inner, tiramisu::var L1_inner);
3769  void tile(tiramisu::var L0, tiramisu::var L1, tiramisu::var L2,
3770  int sizeX, int sizeY, int sizeZ);
3771  void tile(tiramisu::var L0, tiramisu::var L1, tiramisu::var L2,
3772  int sizeX, int sizeY, int sizeZ,
3773  tiramisu::var L0_outer, tiramisu::var L1_outer,
3774  tiramisu::var L2_outer, tiramisu::var L0_inner,
3775  tiramisu::var L1_inner, tiramisu::var L2_inner);
3776  // @}
3777 
3778  /**
3779  * Tile the two loop levels \p L0 and \p L1 with rectangular
3780  * tiling. \p sizeX and \p sizeY represent the tile size.
3781  * \p L0 and \p L1 should be two consecutive loop levels
3782  * (i.e., \p L0 = \p L1 + 1) and they should satisfy
3783  * \p L0 > \p L1.
3784  */
3785  // @{
3786  void tile(int L0, int L1, int sizeX, int sizeY);
3787  void tile(int L0, int L1, int L2, int sizeX, int sizeY, int sizeZ);
3788  // @}
3789 
3790  /**
3791  * Unroll the loop level \p L with an unrolling factor \p fac.
3792  *
3793  * The difference between this function and the function
3794  * tag_unroll_level() is that this function separates
3795  * the iteration domain into full and partial iteration
3796  * domains for unrolling first and then it calls
3797  * tag_unroll_level().
3798  * tag_unroll_level() only tags a dimension to
3799  * be unrolled, it does not modify the tagged dimension.
3800  *
3801  * This function separates the iteration domain into two iteration
3802  * domains, a full iteration domain and a partial iteration domain.
3803  * The full iteration domain has an upper bound that is multiple of
3804  * \p fac while the other does not.
3805  * The full iteration domain is then split by \p fac and the inner loop
3806  * (which should have a constant extent equal to \p fac) is tagged as
3807  * a unrolled loop.
3808  *
 3809  * Let us assume the following loop (a loop represents an iteration
3810  * domain)
3811  *
3812  * \code
3813  * for (i=0; i<N; i++)
3814  * for (j=0; j<23; j++)
3815  * S0;
3816  * \endcode
3817  *
3818  * To unroll the j loop with an unrolling factor of 4, one should call
3819  *
3820  * S0.unroll(j, 4);
3821  *
3822  * The loop (iteration domain) is first separated into the following
3823  * two loops
3824  *
3825  * \code
3826  * for (int i=0; i<20; i++)
3827  * S0;
3828  *
3829  * for (int i=20; i<23; i++)
3830  * S0;
3831  * \endcode
3832  *
3833  * The full loop is then split by 4
3834  *
3835  * \code
3836  * for (int i1=0; i1<20/4; i1++)
3837  * for (int i2=0; i2<4; i2++)
3838  * S0;
3839  *
3840  * for (int i=20; i<23; i++)
3841  * S0;
3842  * \endcode
3843  *
3844  * the i2 loop is then tagged to be unrolled.
3845  *
3846  * \p L_outer and \p L_inner are the names of the new loops created
3847  * after splitting. If not provided, default names will be assigned.
3848  * \p L_outer is the outer loop.
3849  *
3850  */
3851  //@{
3852  void unroll(tiramisu::var L, int fac);
3853  void unroll(tiramisu::var L, int fac, tiramisu::var L_outer, tiramisu::var L_inner);
3854  //@}
3855 
3856  /**
3857  * Vectorize the loop level \p L. Use the vector length \p v.
3858  *
3859  * The difference between this function and the function
3860  * tag_vector_level() is that this function
3861  * prepares the iteration domain for vectorization first
3862  * and then it calls tag_vector_level().
3863  * tag_vector_level() only tags a dimension to
3864  * be vectorized, it does not change the tagged dimension.
3865  *
3866  * This function will separate the iteration domain into two iteration
3867  * domains, a full iteration domain and a partial iteration domain.
3868  * The full iteration domain has an upper bound that is multiple of
3869  * \p v while the other does not.
3870  * The full iteration domain is then split by \p v and the inner loop
3871  * (which should have a constant extent equal to \p v) is tagged as
3872  * a vector loop.
3873  *
 3874  * Let us assume the following loop (a loop represents an iteration
3875  * domain)
3876  *
3877  * \code
3878  * for (i=0; i<N; i++)
3879  * for (j=0; j<23; j++)
3880  * S0;
3881  * \endcode
3882  *
3883  * To vectorize the j loop with a vector length 4, one should call
3884  *
3885  * S0.vectorize(j, 4);
3886  *
3887  * The loop (iteration domain) is first separated into the following
3888  * two loops
3889  *
3890  * \code
3891  * for (int i=0; i<20; i++)
3892  * S0;
3893  *
3894  * for (int i=20; i<23; i++)
3895  * S0;
3896  * \endcode
3897  *
3898  * The full loop is then split by 4
3899  *
3900  * \code
3901  * for (int i1=0; i1<20/4; i1++)
3902  * for (int i2=0; i2<4; i2++)
3903  * S0;
3904  *
3905  * for (int i=20; i<23; i++)
3906  * S0;
3907  * \endcode
3908  *
3909  * the i2 loop is then tagged to be vectorized.
3910  *
3911  * The user has to make sure that the extent of the dimension
3912  * is bigger than \p v. The vectorization of a loop that has
3913  * less than \p v iterations is not correct.
3914  *
3915  * The names of the new loop iterators created after vectorization
 3916  * are \p L_outer and \p L_inner. If not provided, default names
 3917  * are assigned.
3918  */
3919  // @{
3920  void vectorize(tiramisu::var L, int v);
3921  void vectorize(tiramisu::var L, int v, tiramisu::var L_outer, tiramisu::var L_inner);
3922  // @}
3923 
3924  /**
3925  * root_dimension is a number used to specify the dimension level
3926  * known as root.
3927  * The root dimension level is the outermost level. It is the level
3928  * outside any loop nest. Loop level 0 is the level of the first loop
3929  * (outermost loop), loop 1 is the level of following inner loop, ...
3930  *
3931  * Where is this number used ?
3932  *
3933  * These numbers are used in the helper functions used for scheduling
3934  * (such as after(), before(), ...).
3935  * For example, c0.after(c1) indicates that the computation c0 should
3936  * be executed after the computation c1.
3937  * Since the two computations c0 and c1 are usually nested in a loop,
3938  * we need to specify at which loop level c0 is after c1. This is where
3939  * we need to specify the loop level numbers.
3940  * Here is an example. Suppose that the two computations c0 and c1
3941  * have the following iteration domains
3942  * {c0[i,j]: 0<=i<N and 0<=j<N} and {c1[i,j]: 0<=i<N and 0<=j<N}.
3943  *
3944  * When code is generated for the two computations, two loop nests
3945  * are generated. When scheduling c0 after c1 using the after function,
3946  * the user can choose one among three possibilities in specifying at
3947  * which level c0 is after c1.
3948  *
3949  * - c0.after(c1, computation::root_dimension) would create a schedule
3950  * that generates the following code
3951  *
3952  * \code
3953  * for (i=0; i<N; i++)
3954  * for (j=0; j<N; j++)
3955  * c1;
3956  * for (i=0; i<N; i++)
3957  * for (j=0; j<N; j++)
3958  * c0;
3959  * \endcode
3960  *
3961  * - c0.after(c1, 0) would create a schedule that generates the
3962  * following code
3963  *
3964  * \code
3965  * for (i=0; i<N; i++) {
3966  * for (j=0; j<N; j++)
3967  * c1;
3968  * for (j=0; j<N; j++)
3969  * c0;
3970  * }
3971  * \endcode
3972  *
3973  * This means that c0 is after c1 starting from loop level 0,
3974  * (before the loop level 0, c0 and c1 have the same order).
3975  *
3976  * - c0.after(c1, 1) would create a schedule that generates the
3977  * following code
3978  *
3979  * \code
3980  * for (i=0; i<N; i++)
3981  * for (j=0; j<N; j++) {
3982  * c1;
3983  * c0;
3984  * }
3985  * \endcode
3986  *
3987  * This means that c0 is after c1 starting from loop level 1,
3988  * (before the loop level 1, c0 and c1 have the same order).
3989  */
3990  const static var root;
3991 
3992  /**
3993  * Equivalent of computation::root but to be used with scheduling
3994  * functions that take loop level (integers) as input instead of
3995  * tiramisu::var.
3996  */
3997  const static int root_dimension = -1;
3998 
3999  /**
4000  * Access operator: C0(i,j) represents an access to
4001  * the element (i,j) of the computation C0.
4002  * C0(i,j) represents the value computed by the computation
4003  * C0(i,j)
4004  */
4005  template<typename... Args> tiramisu::expr operator()(Args... args)
4006  {
4007  // TODO move to cpp
4008  std::vector<tiramisu::expr> access_expressions{std::forward<Args>(args)...};
4009  if (access_expressions.size() != number_of_dims)
4010  {
4011  tiramisu::str_dump("Error - Incorrect access: " + this->get_name() + "(");
4012  for (int i = 0; i < access_expressions.size(); i++)
4013  {
4014  tiramisu::expr e = access_expressions[i];
4015  e.dump(false);
4016  if (i != access_expressions.size() - 1)
4017  tiramisu::str_dump(", ");
4018  }
4019  tiramisu::str_dump(").\n");
4020  tiramisu::str_dump("The number of access dimensions does not match that used in the declaration of " + this->get_name() + ".\n\n");
4021  exit(1);
4022  }
4023 
4024  if (this->is_inline_computation()) {
4025  std::vector<std::pair<var, expr>> substitutions;
4026  for (auto const &variable: this->access_variables) {
4027  // variable is an (index, variable_name) pair
4028  substitutions.push_back(std::make_pair(var(variable.second, false),
4029  access_expressions[variable.first]));
4030  }
4031  // TODO add iteration space for expression
4032  return this->expression.substitute(substitutions);
4033  } else {
4035  this->get_name(),
4036  access_expressions,
4037  this->get_data_type());
4038  }
4039  }
4040 
4041  operator expr();
4042 
4043  static xfer create_xfer(std::string send_iter_domain, std::string recv_iter_domain, tiramisu::expr send_dest,
4044  tiramisu::expr recv_src, xfer_prop send_prop, xfer_prop recv_prop,
4045  tiramisu::expr send_expr, tiramisu::function *fct);
4046 
4047  static xfer create_xfer(std::string iter_domain, xfer_prop prop, tiramisu::expr expr,
4048  tiramisu::function *fct);
4049 
4050 };
4051 
4052 class input: public computation
4053 {
4054 
4055 public:
4056  /**
4057  * \brief Constructor for an input.
4058  *
4059  * \details
4060  *
4061  * Declare an input.
4062  *
4063  * \p name is the name of the input.
4064  *
4065  * \p iterator_variables is a vector that represents the dimensions of
4066  * the input. It is used to define the size of the input.
4067  *
4068  * \p t is the type of the input elements.
4069  * Example of types include (p_uint8, p_uint16, p_uint32, ...).
4070  * Types are defined in \ref tiramisu::primitive_t
4071  *
4072  * Example:
4073  *
4074  * To declare a buffer buf[20, 30] where the buffer elements
4075  * are of type uint8. We can first declare two iterator variables
4076  *
4077  * \code
4078  * var i("i", 0, 20), j("j", 0, 30);
4079  * \endcode
4080  *
4081  * and then we can declare the following input
4082  *
4083  * \code
4084  * input A("A", {i,j}, p_uint8);
4085  * \endcode
4086  *
4087  * Later in the code (in Layer III), we need to actually declare
4088  * the buffer and map this input to that buffer.
4089  *
4090  * An example is provided in tutorial 02.
4091  *
4092  */
4093  input(std::string name, std::vector<var> iterator_variables, primitive_t t):
4094  computation(name, iterator_variables, expr(), false)
4095  {
4096  this->data_type = t;
4097  this->expression.dtype = t;
4098  }
4099 
4100  /**
4101  * \overload
4102  */
4103  input(std::vector<var> iterator_variables, primitive_t t):
4104  input(generate_new_computation_name(), iterator_variables, t)
4105  {
4106  }
4107 };
4108 
4109 class Input: public input{
4110 
4111 public:
4112  std::vector<var> iterators_from_size_expressions(std::vector<expr> sizes)
4113  {
4114  std::vector<var> iterator_variables;
4115 
4116  for (int s = 0; s < sizes.size(); s++)
4117  {
4118  var v(generate_new_computation_name(), 0, sizes[s]);
4119  iterator_variables.push_back(v);
4120  }
4121 
4122  return iterator_variables;
4123  }
4124 
4125  /**
4126  * \overload
4127  */
4128  Input(std::string name, std::vector<expr> sizes, primitive_t t)
4129  :input(name, iterators_from_size_expressions(sizes), t)
4130  {
4131  }
4132 };
4133 
4134 class view:public computation{
4135 
4136 public:
4137  /**
4138  * \brief Constructor for a view.
4139  *
4140  * \details
4141  *
4142  * Declare a view.
4143  *
4144  * \p name is the name of the view.
4145  *
4146  * \p iterator_variables is a vector that represents the dimensions of
4147  * the view. It is used to define the size of the view.
4148  *
4149  * \p t is the type of buffer data (p_float32 for example).
4150  *
4151  * Example:
4152  *
4153  * If we want to access a buffer buf where results of a computation C are stored
4154  * we declare a view V and we map it to the buffer buf.
4155  *
4156  * We declare the iterator variables
4157  *
4158  * \code
4159  * var i("i", 0, 20), j("j", 0, 30);
4160  * \endcode
4161  *
4162  * and then we can declare the following view
4163  *
4164  * \code
4165  * view V("V", {i,j}, p_float64);
4166  * \endcode
4167  *
4168  * Later in the code (in Layer III), we need to map the view to that buffer.
4169  * \code
4170  * V.store_in(&buf);
4171  * \endcode
4172  */
4173  view(std::string name, std::vector<var> iterator_variables, primitive_t t):
4174  computation(name, iterator_variables, expr(), false,t){}
4175 
4176 };
4177 
4178 
4179 
4180 
4181 /**
4182  * A class that represents loop invariants.
4183  *
4184  * An object of the invariant class can be an expression, a symbolic constant
4185  * or a variable that is invariant to all the loops of the function.
4186  */
 4187 class constant: public computation
 4188 {
 4189 private:
 4190  /**
 4191  * If this constant is not function wide, i.e., if it is computed
 4192  * with a computation, then \p compute_with_computation is a pointer on the
 4193  * computation with whom this constant should be computed.
 4194  * We need to know this because we need to translate the accesses
 4195  * used within the RHS of this constant expression to the new scheduled
 4196  * accesses. Since a computation does not have a buffer access (it is
 4197  * translated into a let statement), it also does not have an iterator
 4198  * map, thus it cannot translate its accesses to scheduled accesses.
 4199  * Thus we use the iterator map of the computation with whom we
 4200  * compute this constant and take its iterator map.
 4201  */
 4202  tiramisu::computation *compute_with_computation;
 4203 
 4204 public:
 4205 
 4206  /**
 4207  * Create a constant where \p param_name is the name of
 4208  * the constant that will hold the value of the constant.
 4209  *
 4210  * \p param_expr is the expression that defines the value
 4211  * of the constant.
 4212  *
 4213  * \p t indicates the type of the constant.
 4214  *
 4215  * \p function_wide should be set to true if the constant is
 4216  * defined at the entry of the function and is visible to all
 4217  * the computations.
 4218  * If function_wide is set to true, then the constant is an
 4219  * invariant to the whole function where it is declared.
 4220  *
 4221  * \p with_computation, should be set only if function_wide
 4222  * is false, i.e. if the constant is not function wide.
 4223  * In such a case the user should indicate where the
 4224  * constant should be assigned.
 4225  * \p with_computation indicates that the assignment should
 4226  * be in the loop nest that computes the computation indicated by
 4227  * \p with_computation at the loop level indicated
 4228  * by \p at_loop_level.
 4229  * The root level (i.e. the level outer than any other loop level)
 4230  * is computation::root_dimension.
 4231  * 0 represents the first loop level and 1 represents the second
 4232  * loop level, ...
 4233  *
 4234  * \p func is the function in which the constant is defined. If this
 4235  * argument is not provided, the implicit Tiramisu function is used
 4236  * (i.e., the function created during Tiramisu initialization).
 4237  */
 // NOTE(review): several continuation lines of the two constructor
 // declarations below (the \p t parameter and the defaulted \p func
 // argument) were lost in the doc extraction — confirm against the
 // original core.h.
 4238  constant(std::string param_name, const tiramisu::expr &param_expr,
 4240  bool function_wide,
 4241  tiramisu::computation *with_computation,
 4242  int at_loop_level,
 4244 
 4245  constant(std::string param_name, const tiramisu::expr &param_expr,
 4248 
 4249  /**
 4250  * \brief Create a constant.
 4251  *
 4252  * \details
 4253  *
 4254  * Define a constant (scalar) at the entry of the function and make it visible to all
 4255  * the computations.
 4256  *
 4257  * \p param_name is the name of the constant.
 4258  * \p param_expr is the expression that defines the value of the constant.
 4259  *
 4260  */
 4261  constant(std::string param_name, const tiramisu::expr &param_expr);
 4262 
 4263  /**
 4264  * Get the computation with whom this constant is computed.
 4265  */
 // NOTE(review): the signature line of this inline getter was lost in
 // the doc extraction; it returns the compute_with_computation member.
 4267  {
 4268  return compute_with_computation;
 4269  }
 4270 
 4271  /**
 4272  * Dump the invariant on standard output (dump most of the fields of
 4273  * the invariant class).
 4274  * This is mainly useful for debugging.
 4275  * If \p exhaustive is set to true, all the fields of the invariant
 4276  * class are printed. This is useful to find potential initialization
 4277  * problems.
 4278  */
 4279  void dump(bool exhaustive) const;
 4280 
 4281  operator expr();
 4282 };
4283 
4284 
4285 /**
4286 * A class for code generation.
4287 */
4289 {
4290  friend function;
4291  friend computation;
4292  friend buffer;
4293  friend cuda_ast::generator;
4294 
4295 protected:
4296 
4297  /*
4298  * Compute the iterators map.
4299  * The iterators map is map between the original names of the iterators of a computation
4300  * and their transformed form after schedule (also after renaming).
4301  *
4302  * If in the original computation, we had
4303  *
4304  * {C[i0, i1]: ...}
4305  *
4306  * And if in the generated code, the iterators are called c0, c1, c2 and c3 and
4307  * the loops are tiled, then the map will be
4308  *
4309  * {<i0, c0*10+c2>, <i1, c1*10+c3>}.
4310  *
4311  **/
4312  static std::map<std::string, isl_ast_expr *>
4313  compute_iterators_map(tiramisu::computation *comp, isl_ast_build *build);
4314 
4315  /**
4316  * Get the computation associated with a node.
4317  */
4318  static std::vector<tiramisu::computation *>
4319  get_computation_by_node(tiramisu::function *fct, isl_ast_node *node);
4320 
4321  /**
4322  * Traverse the vector of computations \p comp_vec and return the computations
4323  * that have a domain that intersects with \p domain.
4324  */
4325  static std::vector<tiramisu::computation *> filter_computations_by_domain(std::vector<tiramisu::computation *> comp_vec,
4326  isl_union_set *node_domain);
4327 
4328  /**
4329  * Compute the accesses of the RHS of the computation
4330  * \p comp and store them in the accesses vector.
4331  *
4332  * If \p return_buffer_accesses is set to true, this function returns access functions to
4333  * buffers. Otherwise it returns access functions to computations.
4334  */
4335  static void get_rhs_accesses(const tiramisu::function *func, const tiramisu::computation *comp,
4336  std::vector<isl_map *> &accesses, bool return_buffer_accesses);
4337 
4338  /**
4339  * Analyze the \p access_expression and return a set of constraints
4340  * that correspond to the access pattern of the access_expression.
4341  *
4342  * access_dimension:
4343  * The dimension of the access. For example, the access
4344  * C0(i0, i1, i2) have three access dimensions: i0, i1 and i2.
4345  * access_expression:
4346  * The expression of the access.
4347  * This expression is parsed recursively (by calling get_constraint_for_access)
4348  * and is gradually used to update the constraint.
4349  * access_relation:
4350  * The access relation that represents the access.
4351  * cst:
4352  * The constraint that defines the access and that is being constructed.
4353  * Different calls to get_constraint_for_access modify this constraint
4354  * gradually until the final constraint is created. Only the final constraint
4355  * is added to the access_relation.
4356  * coeff:
4357  * The coefficient in which all the dimension coefficients of the constraint
4358  * are going to be multiplied. This coefficient is used to implement o_minus,
4359  * o_mul and o_sub.
4360  */
4361  static isl_constraint *get_constraint_for_access(int access_dimension,
4362  const tiramisu::expr &access_expression,
4363  isl_map *access_relation,
4364  isl_constraint *cst,
4365  int coeff,
4366  const tiramisu::function *fct);
4367 
4368 
4369  /**
4370  * Generate a Halide statement from an ISL ast node object in the ISL ast
4371  * tree.
4372  * Level represents the level of the node in the schedule. 0 means root.
 4373  * It takes as input:
4374  * - a function \p fct for which we are generating code,
4375  * - a \p node,
4376  * - \p level represents the current loop level being traversed (0 means the outer level.
 4377  * - \p is_a_child_block indicates whether the block that is going to be
 4378  * generated is a child block for another block. In such a case, allocate
 4379  * and let statements should not be generated. Allocate and let statements
4380  * should only be generated in non-child blocks so that their scope reaches
4381  * the whole block.
4382  */
4383  static Halide::Internal::Stmt halide_stmt_from_isl_node(const tiramisu::function &fct, isl_ast_node *node,
4384  int level,
4385  std::vector<std::pair<std::string, std::string>> &tagged_stmts,
4386  bool is_a_child_block);
4387 
4388  // TODO doc
4389  static Halide::Internal::Stmt make_halide_block(const Halide::Internal::Stmt &first,
4390  const Halide::Internal::Stmt &second);
4391 
4392  static Halide::Internal::Stmt make_buffer_alloc(buffer *b, const std::vector<Halide::Expr> &extents,
4393  Halide::Internal::Stmt &stmt);
4394  static Halide::Internal::Stmt make_buffer_free(buffer *b);
4395 
4396  /**
4397  * Create a Halide expression from a Tiramisu expression.
4398  */
4399  static Halide::Expr halide_expr_from_tiramisu_expr(const tiramisu::function *fct,
4400  std::vector<isl_ast_expr *> &index_expr,
4401  const tiramisu::expr &tiramisu_expr, tiramisu::computation *comp = nullptr);
4402 
4403  static tiramisu::expr replace_accesses(const tiramisu::function * func, std::vector<isl_ast_expr *> &index_expr,
4404  const tiramisu::expr &tiramisu_expr);
4405  static std::string get_buffer_name(const tiramisu::computation *);
4406  static tiramisu::expr comp_to_buffer(tiramisu::computation * comp, std::vector<isl_ast_expr *> &ie,
4407  const tiramisu::expr * expr = nullptr);
4408 
4409  /**
4410  * Linearize a multidimensional access to a Halide buffer.
4411  * Supposing that we have buf[N1][N2][N3], transform buf[i][j][k]
4412  * into buf[k + j*N3 + i*N3*N2].
4413  * Note that the first arg in index_expr is the buffer name. The other args
4414  * are the indices for each dimension of the buffer.
4415  */
4416  //@{
4417  static Halide::Expr linearize_access(int dims, const halide_dimension_t *shape, isl_ast_expr *index_expr);
4418  static Halide::Expr linearize_access(int dims, const halide_dimension_t *shape, std::vector<tiramisu::expr> index_expr);
4419  static Halide::Expr linearize_access(int dims, std::vector<Halide::Expr> &strides, std::vector<tiramisu::expr> index_expr);
4420  static Halide::Expr linearize_access(int dims, std::vector<Halide::Expr> &strides, isl_ast_expr *index_expr);
4421  static tiramisu::expr linearize_access(int dims, std::vector<tiramisu::expr> &strides, std::vector<tiramisu::expr> index_expr);
4422  static tiramisu::expr linearize_access(int dims, std::vector<tiramisu::expr> &strides, isl_ast_expr *index_expr);
4423  //@}
4424 
4425  /**
4426  * Retrieve the access function of the ISL AST leaf node (which represents a
4427  * computation). Store the access in computation->access.
4428  */
4429  static isl_ast_node *stmt_code_generator(isl_ast_node *node, isl_ast_build *build, void *user);
4430 
4431  /**
4432  * Traverse a tiramisu expression (\p exp) and extract the access relations
4433  * from the access operation passed in \p exp. The access relations are added
4434  * to the vector \p accesses.
4435  * The access relation is from the domain of the computation \p comp to the
4436  * domain of the computation accessed by the access operation.
4437  * If \p return_buffer_accesses = true, an access to a buffer is created
4438  * instead of an access to computations.
4439  */
4440  static void traverse_expr_and_extract_accesses(const tiramisu::function *fct,
4441  const tiramisu::computation *comp,
4442  const tiramisu::expr &exp,
4443  std::vector<isl_map *> &accesses,
4444  bool return_buffer_accesses);
4445 
4446  /**
4447  * Traverse a tiramisu expression (\p current_exp) until an expression with the specified name is found.
4448  * Replace that name with a new name. Replaces all occurrences.
4449  */
4450  static void _update_producer_expr_name(tiramisu::expr &current_exp, std::string name_to_replace,
4451  std::string replace_with);
4452 
4453 public:
4454 
4455  /**
4456  * Traverse a tiramisu expression (\p current_exp) until an expression with the specified name is found.
4457  * Replace that name with a new name. Replaces all occurrences.
4458  */
4459  static void update_producer_expr_name(tiramisu::computation *comp, std::string name_to_replace,
4460  std::string replace_with);
4461 };
4462 
4463 /**
4464  * A class containing utility functions.
4465  */
4466 class utility
4467 {
4468 public:
4469  /**
4470  * Traverse recursively the ISL AST tree
4471  *
4472  * \p node represents the root of the tree to be traversed.
4473  *
4474  * \p dim is the dimension of the loop from which the bounds have to be
4475  * extracted.
4476  *
4477  * \p upper is a boolean that should be set to true to extract
4478  * the upper bound and false to extract the lower bound.
4479  */
4480  static expr extract_bound_expression(isl_ast_node *ast, int dim, bool upper);
4481 
4482  /**
4483  * Return a tiramisu::expr representing the bound of
4484  * the dimension \p dim in \p set. If \p upper is true
4485  * then this function returns the upper bound otherwise
4486  * it returns the lower bound.
4487  *
4488  * For example, assuming that
4489  *
4490  * S = {S[i,j]: 0<=i<N and 0<=j<N and i<M}
4491  *
4492  * then
4493  *
4494  * get_upper_bound(S, 1)
4495  *
4496  * would return N-1, while
4497  *
4498  * get_upper_bound(S, 0)
4499  *
4500  * would return min(N-1,M-1)
4501  */
4502  static tiramisu::expr get_bound(isl_set *set, int dim, int upper);
4503 
4504  /**
4505  * Create a comma separated string that represents the list
4506  * of the parameters of \p set.
4507  *
4508  * For example, if the set is
4509  *
4510  * [N,M,K]->{S[i]}
4511  *
4512  * this function returns the string "N,M,K".
4513  */
4514  static std::string get_parameters_list(isl_set *set);
4515 
4516 };
4517 
4518 // TODO Jess: add doc comments
4519 
4520 class xfer_prop {
4521  friend class communicator;
4522 private:
4523 
4524  std::vector<tiramisu::xfer_attr> attrs;
4525 
4526  tiramisu::primitive_t dtype;
4527 
4528  int xfer_prop_id;
4529 
4530  xfer_prop();
4531 
4532 public:
4533 
4534  xfer_prop(tiramisu::primitive_t d_type, std::initializer_list<tiramisu::xfer_attr> attrs);
4535 
4536  xfer_prop(tiramisu::primitive_t d_type, std::initializer_list<tiramisu::xfer_attr> attrs,
4537  int xfer_prop_id);
4538 
4539  static std::set<int> xfer_prop_ids;
4540 
4541  static std::string attr_to_string(xfer_attr attr);
4542 
4543  tiramisu::primitive_t get_dtype() const;
4544 
4545  int get_xfer_prop_id() const;
4546 
4547  bool contains_attr(tiramisu::xfer_attr attr) const;
4548 
4549  bool contains_attrs(std::vector<tiramisu::xfer_attr> attrs) const;
4550 
4551  void add_attr(tiramisu::xfer_attr attr);
4552 
4553 };
4554 
4555 class communicator : public computation {
4556 private:
4557 
4558  std::vector<tiramisu::expr> dims;
4559 
4560 protected:
4561 
4563 
4564  communicator();
4565 
4566 public:
4567 
4568  communicator(std::string iteration_domain_str, tiramisu::expr rhs, bool schedule_this_computation,
4569  tiramisu::primitive_t data_type, tiramisu::function *fct);
4570 
4571  communicator(std::string iteration_domain_str, tiramisu::expr rhs,
4572  bool schedule_this_computation, tiramisu::primitive_t, tiramisu::xfer_prop prop,
4573  tiramisu::function *fct);
4574 
4575  xfer_prop get_xfer_props() const;
4576 
4577  tiramisu::expr get_num_elements() const;
4578 
4579  void add_dim(tiramisu::expr size);
4580 
4581  /**
4582  * Collapse a loop level.
4583  */
4584  std::vector<communicator *> collapse(int level, tiramisu::expr collapse_from_iter,
4585  tiramisu::expr collapse_until_iter, tiramisu::expr num_collapsed);
4586 
4587  /**
4588  * Collapse several consecutive loop levels (must collapse from innermost towards outermost).
4589  */
4590  void collapse_many(std::vector<collapse_group> collapse_each);
4591 
4592 };
4593 
4594 class send : public communicator {
4595 private:
4596 
4597  tiramisu::computation *producer = nullptr;
4598 
4599  recv *matching_recv = nullptr;
4600 
4601  tiramisu::expr msg_tag;
4602 
4603  tiramisu::expr src;
4604 
4605  tiramisu::expr dest;
4606 
4607  static int next_msg_tag;
4608 
4609 public:
4610  // TODO (Jess) is producer ever used?
4611  send(std::string iteration_domain_str, tiramisu::computation *producer, tiramisu::expr rhs,
4612  xfer_prop prop, bool schedule_this_computation, std::vector<expr> dims,
4613  tiramisu::function *fct);
4614 
4615  virtual bool is_send() const override;
4616 
4617  virtual void add_definitions(std::string iteration_domain_str, tiramisu::expr e,
4618  bool schedule_this_computation, tiramisu::primitive_t t,
4619  tiramisu::function *fct) override;
4620 
4621  tiramisu::computation *get_producer() const;
4622 
4623  tiramisu::recv *get_matching_recv() const;
4624 
4625  tiramisu::expr get_msg_tag() const;
4626 
4627  tiramisu::expr get_src() const;
4628 
4629  tiramisu::expr get_dest() const;
4630 
4631  void set_matching_recv(tiramisu::recv *matching_recv);
4632 
4633  void set_src(tiramisu::expr src);
4634 
4635  void set_dest(tiramisu::expr dest);
4636 
4637  void override_msg_tag(tiramisu::expr msg_tag);
4638 };
4639 
4640 class recv : public communicator {
4641 private:
4642 
4643  send *matching_send = nullptr;
4644 
4645  tiramisu::expr src;
4646 
4647  tiramisu::expr dest;
4648 
4649  tiramisu::expr msg_tag;
4650 
4651 public:
4652 
4653  recv(std::string iteration_domain_str, bool schedule_this, tiramisu::xfer_prop prop,
4654  tiramisu::function *fct);
4655 
4656  virtual bool is_recv() const override;
4657 
4658  virtual void add_definitions(std::string iteration_domain_str, tiramisu::expr e,
4659  bool schedule_this_computation, tiramisu::primitive_t t,
4660  tiramisu::function *fct) override;
4661 
4662  tiramisu::send *get_matching_send() const;
4663 
4664  tiramisu::expr get_msg_tag() const;
4665 
4666  tiramisu::expr get_src() const;
4667 
4668  tiramisu::expr get_dest() const;
4669 
4670  void set_matching_send(tiramisu::send *matching_send);
4671 
4672  void set_src(tiramisu::expr src);
4673 
4674  void set_dest(tiramisu::expr dest);
4675 
4676  void override_msg_tag(tiramisu::expr msg_tag);
4677 
4678 };
4679 
4680 class send_recv : public communicator {
4681 public:
4682 
4683  send_recv(std::string iteration_domain_str, tiramisu::computation *producer,
4684  tiramisu::expr rhs, xfer_prop prop, bool schedule_this_computation,
4685  std::vector<expr> dims, tiramisu::function *fct);
4686 
4687  virtual bool is_send_recv() const override;
4688 
4689 };
4690 
4691 class wait : public communicator {
4692 private:
4693 
4694  tiramisu::expr rhs;
4695 
4696 public:
4697 
4699 
4700  wait(std::string iteration_domain_str, tiramisu::expr rhs, xfer_prop prop, bool schedule_this,
4701  tiramisu::function *fct);
4702 
4703  virtual bool is_wait() const override;
4704 
4705  virtual void add_definitions(std::string iteration_domain_str, tiramisu::expr e,
4706  bool schedule_this_computation, tiramisu::primitive_t t,
4707  tiramisu::function *fct) override;
4708 
4709  std::vector<tiramisu::computation *> get_op_to_wait_on() const;
4710 
4711 };
4712 
4713 // Halide IR specific functions
4714 
4715 void halide_stmt_dump(Halide::Internal::Stmt s);
4716 
4717 Halide::Module lower_halide_pipeline(
4718  const std::string &pipeline_name,
4719  const Halide::Target &t,
4720  const std::vector<Halide::Argument> &args,
4721  const Halide::Internal::LoweredFunc::LinkageType linkage_type,
4722  Halide::Internal::Stmt s);
4723 
4724 int loop_level_into_dynamic_dimension(int level);
4725 int loop_level_into_static_dimension(int level);
4726 /**
4727  * TODO code cleaning:
4728  * - Go to the tutorials, add a small explanation about how Tiramisu should work in general.
4729  * - Add two pages explaining how one should use Tiramisu,
4730  *
4731  * - Have documentation on header files only,
4732  * - Order the functions in the class computations (get functions then update functions ordered in alphabetical order),
4733  * - Clean/document expr.h and type.h
4734  */
4735 
4736 }
4737 
4738 #endif
argument_t
Types of function arguments.
Definition: type.h:133
static function * get_implicit_function()
Return the implicit function created during Tiramisu initialization.
Definition: expr.h:80
std::vector< tiramisu::computation * > automatically_allocated
Keeps track of allocation computations created using allocate_and_map_buffer_automatically() to sched...
Definition: core.h:469
A class that represents computations.
Definition: core.h:1320
Halide::Module lower_halide_pipeline(const std::string &pipeline_name, const Halide::Target &t, const std::vector< Halide::Argument > &args, const Halide::Internal::LoweredFunc::LinkageType linkage_type, Halide::Internal::Stmt s)
std::map< std::string, tiramisu::computation * > computation_list
Definition: core.h:47
primitive_t
tiramisu data types.
Definition: type.h:27
tiramisu::primitive_t dtype
Data type.
Definition: expr.h:219
std::map< std::string, tiramisu::constant * > constant_list
Definition: core.h:48
int loop_level_into_static_dimension(int level)
void codegen(const std::vector< tiramisu::buffer * > &arguments, const std::string obj_filename, const bool gen_cuda_stmt=false)
Generate code.
static std::set< int > xfer_prop_ids
Definition: core.h:4539
Halide::Expr halide_expr_from_tiramisu_expr(const tiramisu::computation *comp, std::vector< isl_ast_expr * > &index_expr, const tiramisu::expr &tiramisu_expr)
Convert a Tiramisu expression into a Halide expression.
isl_ast_expr * wait_index_expr
An index expression just for the request buffer.
Definition: core.h:2284
expr substitute(std::vector< std::pair< var, expr >> substitutions) const
Returns a new expression where for every (var, sub) pair in substitutions, var in the original expres...
HalideCodegenOutput(const std::map< std::string, tiramisu::computation * > &computations, const std::map< std::string, tiramisu::constant * > &constants, const std::map< std::string, tiramisu::buffer * > &buffers)
Definition: core.h:51
Input(std::string name, std::vector< expr > sizes, primitive_t t)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: core.h:4128
void dump(bool exhaustive) const
Dump the object on standard output (dump most of the fields of the expression class).
Definition: expr.h:1021
view(std::string name, std::vector< var > iterator_variables, primitive_t t)
Constructor for a view.
Definition: core.h:4173
input(std::string name, std::vector< var > iterator_variables, primitive_t t)
Constructor for an input.
Definition: core.h:4093
void init(std::string name)
Initialize the Tiramisu compiler and set Tiramisu options to default values.
std::string library_call_name
If the computation represents a library call, this is the name of the function.
Definition: core.h:2279
A class that represents buffers.
Definition: core.h:1017
void fuse_after(tiramisu::var lev, computation &comp)
Fuse this computation with the computation passed as argument in the same loop.
Definition: core.h:3230
isl_map * isl_map_add_free_var(const std::string &free_var_name, isl_map *map, isl_ctx *ctx)
A class that represents a synchronization object.
Definition: expr.h:1688
bool use_low_level_scheduling_commands
A boolean set to true if low level scheduling was used in the program.
Definition: core.h:736
std::vector< var > iterators_from_size_expressions(std::vector< expr > sizes)
Definition: core.h:4112
bool _is_library_call
True if this computation represents a library call.
Definition: core.h:2274
computation(std::string name, std::vector< var > iterator_variables, primitive_t t)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: core.h:2676
int loop_level_into_dynamic_dimension(int level)
std::tuple< int, tiramisu::expr, tiramisu::expr, tiramisu::expr > collapse_group
Loop level, loop start, loop end, number of elements to collapse.
Definition: core.h:95
A class to represent tiramisu expressions.
Definition: expr.h:150
A class to represent functions in Tiramisu.
Definition: core.h:131
xfer_prop prop
Definition: core.h:4562
tiramisu::send_recv * sr
Definition: core.h:89
std::unordered_map< tiramisu::computation *, std::unordered_map< tiramisu::computation *, int > > sched_graph
Stores all high level scheduling instructions between computations; i.e.
Definition: core.h:659
tiramisu::computation * get_computation_with_whom_this_is_computed()
Get the computation with whom this constant is computed.
Definition: core.h:4266
std::unordered_map< tiramisu::computation *, std::unordered_map< tiramisu::computation *, int > > sched_graph_reversed
Same as sched_graph, except in reverse order (from after to before).
Definition: core.h:665
std::string generate_new_computation_name()
std::map< std::string, tiramisu::buffer * > output_buffers
Definition: core.h:49
A class that represents loop invariants.
Definition: core.h:4187
input(std::vector< var > iterator_variables, primitive_t t)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: core.h:4103
op_t
Types of tiramisu operators.
Definition: type.h:53
tiramisu::recv * r
Definition: core.h:88
computation(std::vector< var > iterator_variables, primitive_t t)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: core.h:2686
xfer_attr
Definition: core.h:73
const std::string & get_name() const
Get the name of the ID or the variable represented by this expressions.
Definition: expr.h:702
void split_string(std::string str, std::string delimiter, std::vector< std::string > &vector)
int get_loop_level_number_from_dimension_name(std::string dim_name)
Search the time-space domain (the range of the schedule) and return the loop level number that corres...
Definition: core.h:3281
HalideCodegenOutput halide_pipeline_to_tiramisu_function(Halide::Internal::Stmt s, const std::vector< Halide::Internal::Function > &outputs, const std::map< std::string, Halide::Internal::Function > &env, const std::map< std::string, std::vector< int32_t >> &output_buffers_size, tiramisu::function *func)
Definition: core.h:27
std::unordered_set< tiramisu::computation * > starting_computations
The set of all computations that have no computation scheduled before them.
Definition: core.h:729
A class that represents constant variable references.
Definition: expr.h:1704
tiramisu::expr operator()(Args...args)
Access operator: C0(i,j) represents an access to the element (i,j) of the computation C0...
Definition: core.h:4005
A class containing utility functions.
Definition: core.h:4466
computation * get_computation_annotated_in_a_node(isl_ast_node *node)
void halide_stmt_dump(Halide::Internal::Stmt s)
static const var root
root_dimension is a number used to specify the dimension level known as root.
Definition: core.h:3990
tiramisu::send * s
Definition: core.h:87
A class for code generation.
Definition: core.h:4288