Threading Building Blocks: Generic Parallel Algorithms – parallel_for, parallel_reduce
Threading Building Blocks
TBB 4.0 Components § Generic Parallel Algorithms: parallel_for, parallel_reduce, parallel_scan, parallel_do, pipeline, parallel_sort, parallel_invoke § Concurrent containers: concurrent_unordered_map, concurrent_unordered_set, concurrent_hash_map, concurrent_queue, concurrent_bounded_queue, concurrent_priority_queue, concurrent_vector § Synchronization primitives: atomic, mutex, recursive_mutex, spin_mutex, spin_rw_mutex, queuing_mutex, queuing_rw_mutex § Memory allocation: tbb_allocator, cache_aligned_allocator, scalable_allocator § Raw tasking: task_group, task_list, task_scheduler_observer § Flow Graph: graph, function_node, broadcast_node, …
TBB 4.0 Components § Generic Parallel Algorithms: parallel_for, parallel_reduce, parallel_scan, parallel_do, pipeline, parallel_sort, parallel_invoke § Concurrent containers: concurrent_unordered_map, concurrent_unordered_set, concurrent_hash_map, concurrent_queue, concurrent_bounded_queue, concurrent_priority_queue, concurrent_vector § Synchronization primitives: atomic, mutex, recursive_mutex, spin_mutex, spin_rw_mutex, queuing_mutex, queuing_rw_mutex § Memory allocation: tbb_allocator, cache_aligned_allocator, scalable_allocator § Raw tasking: task_group, task_list, task_scheduler_observer § Flow Graph: graph, function_node, broadcast_node, …
Μέθοδος 1 task groups + Lambdas long Parallel. Fib(long n) { if ( n < 16 ) return Serial. Fib(n); else { int x, y; tbb: : task_group g; g. run( [&]{ x = Parallel. Fib(n-1); } ); g. run( [&]{ y = Parallel. Fib(n-2); } ); g. wait(); return x+y; } }
Μέθοδος 2 task objects long Serial. Fib(long n) { if (n < 2) return n; else return Serial. Fib(n-1) + Serial. Fib(n-2); } class Fib. Task: public task { const long n; long *const sum; Fib. Task(long n_, long* sum_) { n=n_; sum=sum_; } task* execute() { if (n < cut. Off) *sum = Serial. Fib(n); else { long x, y; Fib. Task& a = *new ( allocate_child())Fib. Task(n-1, &x); Fib. Task& b = *new ( allocate_child())Fib. Task(n-2, &y); long n, sum; set_ref_count(3); spawn(b); spawn(a); wait_for_all(); *sum = x+y; Fib. Task& r = *new ( allocate_root())Fib. Task(n, &sum); spawn_root_and_wait(r); } return NULL; cout << sum; } };
Μέθοδος 2 task objects class Fib. Task: public task { const long n; long *const sum; n_, long* sum_) { each user-defined. Fib. Task(long task must extend tbb: : taskn=n_; sum=sum_; and implement execute() } long Serial. Fib(long n) { if (n < 2) return n; else return Serial. Fib(n-1) + Serial. Fib(n-2); } task* execute() { if (n < cut. Off) *sum = Serial. Fib(n); else { long x, y; Fib. Task& a = *new ( allocate_child())Fib. Task(n-1, &x); Fib. Task& b = *new ( allocate_child())Fib. Task(n-2, &y); long n, sum; set_ref_count(3); spawn(b); spawn(a); wait_for_all(); *sum = x+y; Fib. Task& r = *new ( allocate_root())Fib. Task(n, &sum); spawn_root_and_wait(r); } return NULL; cout << sum; } };
Μέθοδος 2 task objects long Serial. Fib(long n) { if (n < 2) return n; else return Serial. Fib(n-1) + Serial. Fib(n-2); } class Fib. Task: public task { const long n; long *const sum; Fib. Task(long n_, long* sum_) { n=n_; sum=sum_; } task* execute() { if (n < cut. Off) *sum = Serial. Fib(n); else { long x, y; Fib. Task& a = *new ( allocate_child())Fib. Task(n-1, &x); Fib. Task& b = *new ( allocate_child())Fib. Task(n-2, &y); allocate root task (has no parent) long n, sum; set_ref_count(3); spawn(b); spawn(a); wait_for_all(); *sum = x+y; Fib. Task& r = *new ( allocate_root())Fib. Task(n, &sum); spawn_root_and_wait(r); } return NULL; cout << sum; spawn it, and wait here } };
Μέθοδος 2 task objects long Serial. Fib(long n) { if (n < 2) return n; if n small enough, else execute task serially return Serial. Fib(n-1) + Serial. Fib(n-2); } class Fib. Task: public task { const long n; long *const sum; Fib. Task(long n_, long* sum_) { n=n_; sum=sum_; } task* execute() { if (n < cut. Off) *sum = Serial. Fib(n); else { long x, y; Fib. Task& a = *new ( allocate_child())Fib. Task(n-1, &x); Fib. Task& b = *new ( allocate_child())Fib. Task(n-2, &y); otherwise create and run two tasks long n, sum; set_ref_count(3); spawn(b); spawn(a); wait_for_all(); *sum = x+y; Fib. Task& r = *new ( allocate_root())Fib. Task(n, &sum); spawn_root_and_wait(r); } return NULL; cout << sum; } };
Μέθοδος 2 task objects long Serial. Fib(long n) { if (n < 2) return n; else return Serial. Fib(n-1) + Serial. Fib(n-2); allocate child } class Fib. Task: public task { const long n; long *const sum; Fib. Task(long n_, long* sum_) { n=n_; sum=sum_; } task* execute() { if (n < cut. Off) *sum = Serial. Fib(n); else { long x, y; Fib. Task& a = *new ( allocate_child())Fib. Task(n-1, &x); Fib. Task& b = *new ( allocate_child())Fib. Task(n-2, &y); tasks long n, sum; spawn tasks set_ref_count(3); spawn(b); spawn(a); wait_for_all(); *sum = x+y; Fib. Task& r = *new ( (indicate them as allocate_root())Fib. Task(n, &sum); “ready to execute”) spawn_root_and_wait(r); } return NULL; cout << sum; } }; merge their results and store into *sum how many children should I wait for? 2 (+1 implicit. . . ) ok, now really wait for children to complete
Chunking και loop partitioners parallel_for( blocked_range<size_t>(0, n, G), [](){…}, some_partitioner() ) § Chunking: το μέγεθος των ranges στο οποίο σταματά η αναδρομική διάσπαση – optional argument στον constructor του blocked_range § Partitioners – optional argument στην parallel_for 1. simple_partitioner • recursive binary splitting, εγγυάται ότι ⌈G/2⌉ ≤ chunksize ≤ G 2. affinity_partitioner • αναθέτει τα ranges με τρόπο ώστε να μεγιστοποιείται το cache locality 3. auto_partitioner (default) • επιλέγει αυτόματα το grainsize με βάση ευριστική μέθοδο • προσπαθεί να ελαχιστοποιήσει το range splitting σε σημείο που να εξασφαλίζεται καλό load balancing
Λειτουργία parallel_for 0 N A A P 0 P 1 P 2 P 3
Λειτουργία parallel_for 0 N A A P 0 P 1 P 2 P 3
Λειτουργία parallel_for 0 N N/2 A N B B A P 0 P 1 P 2 P 3
Λειτουργία parallel_for 0 0 N N/2 C N/2 A N B B C A P 0 P 1 P 2 P 3
Λειτουργία parallel_for 0 0 N N/2 C N/2 A N B B C P 0 P 1 P 2 P 3
Λειτουργία parallel_for 0 0 N N/2 C A B B D D C P 0 P 1 P 2 P 3
Λειτουργία parallel_for 0 N N/2 C 0 A B B D 0 N/4 E E D C P 0 P 1 P 2 P 3
Λειτουργία parallel_for 0 N N/2 C 0 A B B D 0 N/4 E E D C P 0 P 1 P 2 P 3
Λειτουργία parallel_for 0 N N/2 C 0 A B D 0 N/4 E D C E B P 0 P 1 P 2 P 3
Λειτουργία parallel_for 0 N/2 C 0 0 N/4 N E A B D E D B P 0 P 1 P 2 P 3
Λειτουργία parallel_for 0 0 - - N/2 C N A B D 0 - N/4 E D E F F B P 0 P 1 P 2 P 3
Λειτουργία parallel_for 0 0 - - N/2 C N A B D F 0 - N/4 E D E 0 N/8 G F G B P 0 P 1 P 2 P 3
Λειτουργία parallel_for 0 0 - - N/2 C N A B D F 0 - N/4 E D E 0 N/8 G F G B P 0 P 1 P 2 P 3
Λειτουργία parallel_for 0 0 - - N/2 C N A B F 0 - N/4 E D E 0 N/8 G F G B D P 0 P 1 P 2 P 3
Generic task graph task groups S(); task_group g; g. run( [&]{ C(); E(); } ); g. run( [&]{ task_group g 1; g 1. run( [&]{A(); } ); g 1. run( [&]{B(); } ); g 1. wait(); D(); }); g. wait(); F();
Generic task graph flow graph g; broadcast_node <continue_msg > s; continue_node <continue_msg continue_node <continue_msg make_edge(s, a); make_edge(s, b); make_edge(s, c); make_edge(a, d); make_edge(b, d); make_edge(c, e); make_edge(d, f); make_edge(e, f); > > > a(g, A()); b(g, B()); c(g, C()); d(g, D()); e(g, E()); f(g, F()); S(); s. try_put(continue_msg()); //fire! g. wait_for_all();
Generic task graph task objects + reference counts 0 1 1 1 2 2 2 2 1 2 2 2 2 class MeshTask: public task { public: const int i, j; //coordinates MeshTask *south, *east; task* execute() { double north = (i==0) ? 0 : A[i-1][j]; double west = (j==0) ? 0 : A[i][j-1]; A[i][j] = do_work(north, west); //if there is south neighbor if(south!=NULL) if (!south->decrement_ref_count()) spawn(*south); //if there is east neighbor if(east!=NULL) if (!east->decrement_ref_count()) spawn(*east); return NULL; } }; MeshTask* Mesh[4][5]; //for all tasks in Mesh: // allocate // initialize south, east pointers // set reference counters //wait for all but last task to complete Mesh[3][4]->spawn_and_wait_for_all(*Mesh[0][0]); //execute last task Mesh[3][4]->execute();
Resources § Home: – http://threadingbuildingblocks.org/ § Latest stable release (4.0): – https://threadingbuildingblocks.org/file.php?fid=77 – use sources § Documentation: – https://threadingbuildingblocks.org/documentation.php – Getting Started – Tutorial – Reference § Intel Software Network blogs: – http://software.intel.com/en-us/blogs/tag/tbb/ § Forum: – http://software.intel.com/en-us/forums/intel-threading-building-blocks/
Extra slides
Lambda Expressions § “C++11 feels like a new language” [B. Stroustrup] § Δυνατότητα “in-place” ορισμού συναρτήσεων στο σημείο που χρησιμοποιούνται – αντί των function objects – ο compiler δημιουργεί μοναδικό, ανώνυμο function object για κάθε lambda expression char s[]="Hello World!"; int nup = 0; //modified by the lambda for_each( s, s+sizeof(s), [&nup] (char c) { if (isupper(c)) nup++; } ); cout << nup << " uppercase letters in: "<< s << endl; § gcc 4.5 or newer
Lambda Syntax § [capture_mode] (formal_parameters) -> return_type {body} § capture_mode: [&] by-reference, [=] by-value, [] no capture § (formal_parameters): can omit if there are no parameters and return type is implicit § -> return_type: can omit if return type is void or code is “return expr;” § Examples [&](float x) {sum+=x; } []{return rand(); } [&]{return *p++; } [](float x, float y)->float { if(x<y) return x; else return y; } [=](float x) {return a*x+b; }
Γενικευμένος Προγραμματισμός § «... deals with finding abstract representations of efficient algorithms, data structures, and other software concepts, and with their systematic organization» [Czarnecki, Eisenecker – Generative Programming] § «... a style of computer programming in which algorithms are written in terms of to-be-specified-later types that are then instantiated when needed for specific types provided as parameters» [wikipedia] § Σκοπός η ανάπτυξη λογισμικού ώστε να είναι επαναχρησιμοποιήσιμο με απλό και αποδοτικό τρόπο
Templates § Επιτρέπουν την παραμετροποίηση τύπων σε συναρτήσεις και κλάσεις § Παράδειγμα templated συνάρτησης: template<typename T> void swap(T & x, T & y) { T tmp = x; x = y; y = tmp; } … float f1, f2; String s1, s2; swap(f1, f2); /* template instantiation: swap floats */ swap(s1, s2); /* template instantiation: swap strings */ § Ελάχιστες απαιτήσεις για τον Τ: 1. copy constructor T(const T&) 2. assignment operator void T::operator=(const T&); 3. destructor ~T()
Templates § Παράδειγμα templated κλάσης template<typename T, typename U> class pair { public: T first; U second; pair( const T & x, const U & y ) : first(x), second(y) {} }; … //compiler instantiates pair with T=string and U=int pair<string, int> x; x. first = “abc”; x. second = 42;
Παράδειγμα: πλήθος εμφανίσεων μιας τιμής σε ένα vector<int> v; vector<int>: : iterator b = v. begin(), e = v. end(); long c = count( b, e, 4); template<class Iter, class T> long count(Iter first, Iter last, const T& value) { long ret=0; while ( first != last ) if (*first++ == value) ++ret; return ret; } § γενικευμένη υλοποίηση χωρίς να υστερεί σε απόδοση
Παράδειγμα functor template<typename I, typename Functor> void For. Each( I lower, I upper, const Functor& f ) { for ( I i=lower; i<upper; ++i ) f(i); } Template function for iteration class Accumulate { Functor float& acc; float* src; public: Accumulate(float& acc_, float* src_) : acc(acc_), src(src_) (0) {} void operator()( int i ) { acc += src[i]; } }; float a[4] = {1, 3, 9, 27}; float sum = 0. 0; Accumulate A(sum, a); For. Each( 0, 4, A ); cout << sum; 56 Pass functor to template function.
Δήλωση parallel_for template <typename Range, typename Body> void parallel_for(const Range& R, const Body& B ); § Απαιτήσεις για το Body B: B::B( const B& ) – Copy constructor · B::~B() – Destructor · void B::operator()( Range& subrange ) const – Apply B to subrange § Απαιτήσεις για το Range R: R::R( const R& ) – Copy a range · R::~R() – Destroy a range · bool R::empty() const – Is range empty? · bool R::is_divisible() const – Can range be split? · R::R( R& r, split ) – Split r into two subranges § η βιβλιοθήκη παρέχει τις κλάσεις blocked_range, blocked_range2d, blocked_range3d
- Slides: 61