MapReduce Design Patterns Chapter 5: Join Patterns

  • Slides: 57
Download presentation
MapReduce Design Patterns Chapter 5: Join Patterns 2015. 6. 4 G 201449021 진다인

MapReduce Design Patterns Chapter 5: Join Patterns 2015. 6. 4 G 201449021 진다인

Contents I. Introduction II. A Refresher on Joins III. Reduce Side Join IV. Replicated

Contents I. Introduction II. A Refresher on Joins III. Reduce Side Join IV. Replicated Join V. Composite Join VI. Cartesian Product 2015 -06 -04 1

A Refresher on Joins Inner Join + 2015 -06 -04 5 / 56

A Refresher on Joins Inner Join + 2015 -06 -04 5 / 56

A Refresher on Joins Left Outer Join + 2015 -06 -04 6 / 56

A Refresher on Joins Left Outer Join + 2015 -06 -04 6 / 56

A Refresher on Joins Right Outer Join + 2015 -06 -04 7 / 56

A Refresher on Joins Right Outer Join + 2015 -06 -04 7 / 56

A Refresher on Joins Full Outer Join 2015 -06 -04 8 / 56

A Refresher on Joins Full Outer Join 2015 -06 -04 8 / 56

A Refresher on Joins Antijoin 2015 -06 -04 9 / 56

A Refresher on Joins Antijoin 2015 -06 -04 9 / 56

A Refresher on Joins Cartesian Product 2015 -06 -04 10 / 56

A Refresher on Joins Cartesian Product 2015 -06 -04 10 / 56

A Refresher on Joins Cartesian Product 2015 -06 -04 11 / 56

A Refresher on Joins Cartesian Product 2015 -06 -04 11 / 56

Reduce Side Join Structure 2015 -06 -04 14 / 56

Reduce Side Join Structure 2015 -06 -04 14 / 56

Reduce Side Join Example • Stackoverflow의 user/comment 데이터 셋 사용 • 문제: 사용자 정보와

Reduce Side Join Example • Stackoverflow의 user/comment 데이터 셋 사용 • 문제: 사용자 정보와 코멘트 데이터가 주어졌을 때, 둘을 조인시켜 누가 어떤 코멘트를 작성했는지 알 수 있도록 하라. <row Account. Id="-1" Age="1" Down. Votes="1080" Up. Votes="12995" Views="827" About. Me="<p>Hi, I'm not really a person. </p> <p>I'm a background process that helps keep this site clean!</p> <p>I do things like</p> <ul> <li>Randomly poke old unanswered questions every hour so they get some attention</li> <li>Own community questions and answers so nobody gets unnecessary reputation from them</li> <li>Own downvotes on spam/evil posts that get permanently deleted</li> <li>Own suggested edits from anonymous users</li> <li><a href="http: //meta. stackexchange. com/a/92006">Remove abandoned questions</a></li> </ul> " Location="on the server farm" Last. Access. Date="2014 -04 -17 T 00: 17: 22. 260" Display. Name="Community" Creation. Date="2014 -0417 T 00: 17: 22. 260" Reputation="1" Id="-1"/> User data <row User. Id="922184" Creation. Date="2014 -04 -17 T 00: 49: 51. 207" Text="Looks like they arbitrarily choose 250000 as the post id cutoff. (1 st comment on the new Meta. SO!!! AHAHA)" Score="27" Post. Id="250001" Id="1"/> Comment data 2015 -06 -04 16 / 56

Reduce Side Join Example • Mapper #1: User. Join. Mapper public static class User.

Reduce Side Join Example • Mapper #1: User. Join. Mapper public static class User. Join. Mapper extends Mapper<Object, Text, Text> { private Text outkey = new Text(); private Text outvalue = new Text(); @Override public void map(Object key, Text value, Context context) throws IOException, Interrupted. Exception { // Parse the input string into a nice map Map<String, String> parsed = MRDPUtils. transform. Xml. To. Map(value. to. String()); String user. Id = parsed. get("User. Id"); if (user. Id == null) { return; } // The foreign join key is the user ID outkey. set(user. Id); // Flag this record for the reducer and then output outvalue. set("A" + value. to. String()); context. write(outkey, outvalue); } } 2015 -06 -04 17 / 56

Reduce Side Join Example • Mapper #2: Comment. Join. Mapper public static class Comment.

Reduce Side Join Example • Mapper #2: Comment. Join. Mapper public static class Comment. Join. Mapper extends Mapper<Object, Text, Text> { private Text outkey = new Text(); private Text outvalue = new Text(); @Override public void map(Object key, Text value, Context context) throws IOException, Interrupted. Exception { // Parse the input string into a nice map Map<String, String> parsed = MRDPUtils. transform. Xml. To. Map(value. to. String()); String user. Id = parsed. get("User. Id"); if (user. Id == null) { return; } // The foreign join key is the user ID outkey. set(user. Id); // Flag this record for the reducer and then output outvalue. set("B" + value. to. String()); context. write(outkey, outvalue); } } 2015 -06 -04 18 / 56

Reduce Side Join Example • Reducer public void reduce(Text key, Iterable<Text> values, Context context)

Reduce Side Join Example • Reducer public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, Interrupted. Exception { // Clear our lists list. A. clear(); list. B. clear(); // iterate through all our values, binning each record based on what // it was tagged with // make sure to remove the tag! for (Text t : values) { if (t. char. At(0) == 'A') { list. A. add(new Text(t. to. String(). substring(1))); } else if (t. char. At('0') == 'B') { list. B. add(new Text(t. to. String(). substring(1))); } } // Execute our join logic now that the lists are filled execute. Join. Logic(context); } 2015 -06 -04 19 / 56

Reduce Side Join Example • Reducer private void execute. Join. Logic(Context context) throws IOException,

Reduce Side Join Example • Reducer private void execute. Join. Logic(Context context) throws IOException, Interrupted. Exception { if (join. Type. equals. Ignore. Case("inner")) { // If both lists are not empty, join A with B if (!list. A. is. Empty() && !list. B. is. Empty()) { for (Text A : list. A) { for (Text B : list. B) { context. write(A, B); } } Inner join else if (join. Type. equals. Ignore. Case("leftouter")) { // For each entry in A, for (Text A : list. A) { // If list B is not empty, join A and B if (!list. B. is. Empty()) { for (Text B : list. B) { context. write(A, B); } } else { // Else, output A by itself context. write(A, new Text("")); } } 2015 -06 -04 Left outer join 20 / 56

Reduce Side Join Example • Reducer else if (join. Type. equals. Ignore. Case("rightouter")) {

Reduce Side Join Example • Reducer else if (join. Type. equals. Ignore. Case("rightouter")) { // FOr each entry in B, for (Text B : list. B) { // If list A is not empty, join A and B if (!list. A. is. Empty()) { for (Text A : list. A) { context. write(A, B); } else { // Else, output B by itself context. write(new Text(""), B); } 2015 -06 -04 Right outer join 21 / 56

Reduce Side Join Example • Reducer else if (join. Type. equals. Ignore. Case("fullouter")) {

Reduce Side Join Example • Reducer else if (join. Type. equals. Ignore. Case("fullouter")) { // If list A is not empty if (!list. A. is. Empty()) { // For each entry in A for (Text A : list. A) { // If list B is not empty, join A with B if (!list. B. is. Empty()) { for (Text B : list. B) { context. write(A, B); } else { // Else, output A by itself context. write(A, new Text("")); } } else { // If list A is empty, just output B for (Text B : list. B) { context. write(new Text(""), B); } 2015 -06 -04 Full outer join 22 / 56

Reduce Side Join Example • Reducer else if (join. Type. equals. Ignore. Case("anti")) {

Reduce Side Join Example • Reducer else if (join. Type. equals. Ignore. Case("anti")) { // If list A is empty and B is empty or vice versa if (list. A. is. Empty() ^ list. B. is. Empty()) { // Iterate both A and B with null values // The previous XOR check will make sure exactly one of // these lists is empty and therefore won't have output for (Text A : list. A) { context. write(A, new Text("")); } for (Text B : list. B) { context. write(new Text(""), B); } } else { throw new Runtime. Exception( "Join type not set to inner, leftouter, rightouter, fullouter, or anti"); } } antijoin 2015 -06 -04 23 / 56

Reduce Side Join Example • Driver Main Configuration conf = new Configuration(); String[] other.

Reduce Side Join Example • Driver Main Configuration conf = new Configuration(); String[] other. Args = new Generic. Options. Parser(conf, args) . get. Remaining. Args(); if (other. Args. length != 4) { System. err. println("Usage: Reduce. Side. Join <user data> <comment data> <out> [inner|leftouter|rightouter|fullouter|anti]"); System. exit(1); } String join. Type = other. Args[3]; if (!(join. Type. equals. Ignore. Case("inner") || join. Type. equals. Ignore. Case("leftouter") || join. Type. equals. Ignore. Case("rightouter") || join. Type. equals. Ignore. Case("fullouter") || join. Type. equals. Ignore. Case("anti"))) { System. err . println("Join type not set to inner, leftouter, rightouter, fullouter, or anti"); System. exit(2); } Job = new Job(conf, "Reduce Side Join"); // Configure the join type job. get. Configuration(). set("join. type", join. Type); job. set. Jar. By. Class(Reduce. Side. Join. Driver. class ); 2015 -06 -04 24 / 56

Reduce Side Join Example • Driver Main // Use multiple inputs to set which

Reduce Side Join Example • Driver Main // Use multiple inputs to set which input uses what mapper // This will keep parsing of each data set separate from a logical // standpoint // However, this version of Hadoop has not upgraded Multiple. Inputs // to the mapreduce package, so we have to use the deprecated API. // Future releases have this in the "mapreduce" package. Multiple. Inputs. add. Input. Path(job, new Path(other. Args[0]), Text. Input. Format. class, User. Join. Mapper. class); Multiple. Inputs. add. Input. Path(job, new Path(other. Args[1]), Text. Input. Format. class, Comment. Join. Mapper. class); job. set. Reducer. Class(User. Join. Reducer. class); File. Output. Format. set. Output. Path(job, new Path(other. Args[2])); job. set. Output. Key. Class(Text. class); job. set. Output. Value. Class(Text. class); System. exit(job. wait. For. Completion(true) ? 0 : 3); 2015 -06 -04 25 / 56

Reduce Side Join Result 2015 -06 -04 26 / 56

Reduce Side Join Result 2015 -06 -04 26 / 56

Replicated Join Structure • Mapper: 분산 캐시(Distributed cache)에서 모든 파일을 읽어와 메모리에 있는

Replicated Join Structure • Mapper: 분산 캐시(Distributed cache)에서 모든 파일을 읽어와 메모리에 있는 Look-up table에 저장한다. • Combiner, Partitioner, Reducer 없음: Map-only 2015 -06 -04 29 / 56

Replicated Join Example: Replicated user comment • Mapper code public static class Replicated. Join.

Replicated Join Example: Replicated user comment • Mapper code public static class Replicated. Join. Mapper extends Mapper<Object, Text, Text> { private Hash. Map<String, String> user. Id. To. Info = new Hash. Map<String, String>(); private Text outvalue = new Text(); private String join. Type = null; @Override public void setup(Context context) throws IOException, Interrupted. Exception { try { Path[] files = Distributed. Cache. get. Local. Cache. Files(context . get. Configuration()); if (files == null || files. length == 0) { throw new Runtime. Exception("User information is not set in Distributed. Cache"); } // Read all files in the Distributed. Cache for (Path p : files) { Buffered. Reader rdr = new Buffered. Reader( new Input. Stream. Reader( new GZIPInput. Stream( new File(p. to. String()))))); 2015 -06 -04 31 / 56

Replicated Join Example: Replicated user comment • Mapper code String line; // For each

Replicated Join Example: Replicated user comment • Mapper code String line; // For each record in the user file while ((line = rdr. read. Line()) != null) { // Get the user ID for this record Map<String, String> parsed = MRDPUtils. transform. Xml. To. Map(line); String user. Id = parsed. get("Id"); if (user. Id != null) { // Map the user ID to the record user. Id. To. Info. put(user. Id, line); } } } catch (IOException e) { throw new Runtime. Exception(e); } // Get the join type join. Type = context. get. Configuration(). get("join. type"); } 2015 -06 -04 32 / 56

Replicated Join Example: Replicated user comment • Mapper code public void map(Object key, Text

Replicated Join Example: Replicated user comment • Mapper code public void map(Object key, Text value, Context context) throws IOException, Interrupted. Exception { // Parse the input string into a nice map Map<String, String> parsed = MRDPUtils. transform. Xml. To. Map(value. to. String()); String user. Id = parsed. get("User. Id"); if (user. Id == null) { return; } String user. Information = user. Id. To. Info. get(user. Id); // If the user information is not null, then output if (user. Information != null) { outvalue. set(user. Information); Context. write(value, outvalue); } else if (join. Type. equals. Ignore. Case("leftouter")) { // If we are doing a left outer join, output the record with an // empty value Context. write(value, new Text("")); } } } 2015 -06 -04 33 / 56

Replicated Join Example: Replicated user comment • Driver code public static void main(String[] args)

Replicated Join Example: Replicated user comment • Driver code public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] other. Args = new Generic. Options. Parser(conf, args). get. Remaining. Args(); if (other. Args. length != 4) { System. err . println("Usage: Replicated. Join <user data> <comment data> <out> [inner|leftouter]"); System. exit(1); } String join. Type = other. Args[3]; if (!(join. Type. equals. Ignore. Case("inner") || join. Type . equals. Ignore. Case("leftouter"))) { System. err. println("Join type not set to inner or leftouter"); System. exit(2); } // Configure the join type Job job = new Job(conf, "Replicated Join"); job. get. Configuration(). set("join. type", join. Type); job. set. Jar. By. Class(Replicated. Join. Driver. class ); 2015 -06 -04 34 / 56

Replicated Join Example: Replicated user comment • Driver code job. set. Mapper. Class(Replicated. Join.

Replicated Join Example: Replicated user comment • Driver code job. set. Mapper. Class(Replicated. Join. Mapper. class ); job. set. Num. Reduce. Tasks(0); Text. Input. Format. set. Input. Paths(job, new Path(other. Args[1])); Text. Output. Format. set. Output. Path(job, new Path(other. Args[2])); job. set. Output. Key. Class(Text. class ); job. set. Output. Value. Class(Text. class ); // Configure the Distributed. Cache. add. Cache. File(new Path(other. Args[0]). to. Uri(), job. get. Configuration()); Distributed. Cache. set. Local. Files(job. get. Configuration(), other. Args[0]); System. exit(job. wait. For. Completion(true ) ? 0 : 3); } 2015 -06 -04 35 / 56

Composite Join Example • Driver code public static void main(String[] args) throws Exception {

Composite Join Example • Driver code public static void main(String[] args) throws Exception { Job. Conf conf = new Job. Conf("Composite. Join"); conf. set. Jar. By. Class(Composite. Join. Driver. class); String[] other. Args = new Generic. Options. Parser(conf, args). get. Remaining. Args(); if (other. Args. length != 4) { System. err. println("Usage: Composite. Join <user data> <comment data> <out> [inner|outer]"); System. exit(1); } Path user. Path = new Path(other. Args[0]); Path comment. Path = new Path(other. Args[1]); Path output. Dir = new Path(other. Args[2]); String join. Type = other. Args[3]; if (!(join. Type. equals. Ignore. Case("inner") || join. Type. equals. Ignore. Case("outer"))) { System. err. println("Join type not set to inner or outer"); System. exit(2); } 2015 -06 -04 40 / 56

Composite Join Example • Driver code (Cont’d. ) conf. set. Mapper. Class(Composite. Mapper. class

Composite Join Example • Driver code (Cont’d. ) conf. set. Mapper. Class(Composite. Mapper. class ); conf. set. Num. Reduce. Tasks(0); // Set the input format class to a Composite. Input. Format class. // The Composite. Input. Format will parse all of our input files and output // records to our mapper. conf. set. Input. Format(Composite. Input. Format. class ); // The composite input format join expression will set how the records // are going to be read in, and in what input format. conf. set("mapred. join. expr", Composite. Input. Format. compose(join. Type, Key. Value. Text. Input. Format. class , user. Path, comment. Path)); Text. Output. Format. set. Output. Path(conf, output. Dir); conf. set. Output. Key. Class(Text. class ); conf. set. Output. Value. Class(Text. class ); Running. Job job = Job. Client. run. Job(conf); while ( !job. is. Complete()) { Thread. sleep(1000); } System. exit(job. is. Successful() ? 0 : 2); 2015 -06 -04 41 / 56

Composite Join Example • Mapper code public static class Composite. Mapper extends Map. Reduce.

Composite Join Example • Mapper code public static class Composite. Mapper extends Map. Reduce. Base implements Mapper<Text, Tuple. Writable, Text> { @Override public void map(Text key, Tuple. Writable value, Output. Collector<Text, Text> output, Reporter reporter) throws IOException { // Get the first two elements in the tuple and output them output. collect((Text) value. get(0), (Text) value. get(1)); } } 2015 -06 -04 42 / 56

Cartesian Product Performance analysis 2015 -06 -04 44 / 56

Cartesian Product Performance analysis 2015 -06 -04 44 / 56

Cartesian Product Example • Input formatter public static class Cartesian. Input. Format extends File.

Cartesian Product Example • Input formatter public static class Cartesian. Input. Format extends File. Input. Format { public static final String LEFT_INPUT_FORMAT = "cart. left. inputformat"; public static final String LEFT_INPUT_PATH = "cart. left. path"; public static final String RIGHT_INPUT_FORMAT = "cart. right. inputformat"; public static final String RIGHT_INPUT_PATH = "cart. right. path"; public static void set. Left. Input. Info(Job. Conf conf, Class<? extends File. Input. Format> input. Format, String input. Path) { conf. set(LEFT_INPUT_FORMAT, input. Format. get. Canonical. Name()); conf. set(LEFT_INPUT_PATH, input. Path); } public static void set. Right. Input. Info(Job. Conf job, Class<? extends File. Input. Format> input. Format, String input. Path) { job. set(RIGHT_INPUT_FORMAT, input. Format. get. Canonical. Name()); job. set(RIGHT_INPUT_PATH, input. Path); } 2015 -06 -04 47 / 56

Cartesian Product Example • Input formatter @Override public Input. Split[] get. Splits(Job. Conf conf,

Cartesian Product Example • Input formatter getSplits : 좌/우 데이터 셋에서 input split을 가져와 (좌측 split의 길이 * 우측 split의 길이)만큼 split 생성 후 리턴 @Override public Input. Split[] get. Splits(Job. Conf conf, int num. Splits) throws IOException { try { // Get the input splits from both the left and right data sets Input. Split[] left. Splits = get. Input. Splits(conf, conf. get(LEFT_INPUT_FORMAT), conf. get(LEFT_INPUT_PATH), num. Splits); Input. Split[] right. Splits = get. Input. Splits(conf, conf. get(RIGHT_INPUT_FORMAT), conf. get(RIGHT_INPUT_PATH), num. Splits); // Create our Composite. Input. Splits, size equal to left. length * // right. length Composite. Input. Split[] return. Splits = new Composite. Input. Split[left. Splits. length * right. Splits. length]; int i = 0; // For each of the left input splits for (Input. Split left : left. Splits) { // For each of the right input splits for (Input. Split right : right. Splits) { // Create a new composite input split composing of the two return. Splits[i] = new Composite. Input. Split(2); return. Splits[i]. add(left); return. Splits[i]. add(right); ++i; } } // Return the composite splits LOG. info("Total splits to process: " + return. Splits. length); return. Splits ; } 2015 -06 -04 48 / 56

Cartesian Product Example • Input formatter get. Record. Reader : Cartesian record reader의 새

Cartesian Product Example • Input formatter getRecordReader : Cartesian record reader의 새 인스턴스 생성 및 반환 @Override public Record. Reader get. Record. Reader(Input. Split split, Job. Conf conf, Reporter reporter) throws IOException { // create a new instance of the Cartesian record reader return new Cartesian. Record. Reader((Composite. Input. Split) split, conf, reporter); } private Input. Split[] get. Input. Splits(Job. Conf conf, String input. Format. Class, String input. Path, int num. Splits) throws Class. Not. Found. Exception, IOException { // Create a new instance of the input format File. Input. Format input. Format = (File. Input. Format) Reflection. Utils . new. Instance(Class. for. Name(input. Format. Class), conf); // Set the input path for the left data set input. Format. set. Input. Paths(conf, input. Path); // Get the left input splits return input. Format. get. Splits(conf, num. Splits); } 2015 -06 -04 getInputSplits : 두 데이터 셋을 입력받아 데카르트 곱을 생성한 후 inputSplit의 list로 리턴 49 / 56

Cartesian Product Example • Record reader public static class Cartesian. Record. Reader<K 1, V

Cartesian Product Example • Record reader public static class Cartesian. Record. Reader<K 1, V 1, K 2, V 2> implements Record. Reader<Text, Text> { // Record readers to get key value pairs private Record. Reader left. RR = null, right. RR = null; // Store configuration to re-create the right record reader private File. Input. Format right. FIF; private Job. Conf right. Conf; private Input. Split right. IS; private Reporter right. Reporter; // Helper variables private K 1 lkey; private V 1 lvalue; private K 2 rkey; private V 2 rvalue; private boolean go. To. Next. Left = true, alldone = false; 2015 -06 -04 50 / 56

Cartesian Product Example • Record reader public Cartesian. Record. Reader(Composite. Input. Split split, Job.

Cartesian Product Example • Record reader 생성자 : 좌/우 레코드를 읽기 위한 reader 오브젝트를 생성하고 이 reader를 바탕으로 좌/우 데이터 셋 입력 split의 Key/Value를 생성한다. public Cartesian. Record. Reader(Composite. Input. Split split, Job. Conf conf, Reporter reporter) throws IOException { this. right. Conf = conf; this. right. IS = split. get(1); this. right. Reporter = reporter; try { // Create left record reader File. Input. Format left. FIF = (File. Input. Format) Reflection. Utils . new. Instance(Class. for. Name(conf. get(Cartesian. Input. Format. LEFT_INPUT_FORMAT)), conf); left. RR = left. FIF. get. Record. Reader(split. get(0), conf, reporter); // Create right record reader right. FIF = (File. Input. Format) Reflection. Utils . new. Instance(Class. for. Name(conf. get(Cartesian. Input. Format. RIGHT_INPUT_FORMAT)), conf); right. RR = right. FIF. get. Record. Reader(right. IS, right. Conf, right. Reporter); } catch (Class. Not. Found. Exception e) { e. print. Stack. Trace(); throw new IOException(e); } // Create key value pairs for parsing lkey = (K 1) this. left. RR. create. Key(); lvalue = (V 1) this. left. RR. create. Value(); rkey = (K 2) this. right. RR. create. Key(); rvalue = (V 2) this. right. RR. create. Value(); } 2015 -06 -04 51 / 56

Cartesian Product next Example • 1. 첫 번째 호출에 왼쪽 데이터 셋에서 첫 번째

Cartesian Product Example • Record reader next : 1. 첫 번째 호출에 왼쪽 데이터 셋에서 첫 번째 레코드를 읽어 mapper 입력 키 생성 2. 이어지는 호출들에서 우측 레코드 리더를 통해 mapper에서 더 이상 처리할 레코드가 없다고 통보할 때까지 계속 레코드를 읽음 3. 우측 레코드 리더가 한 번 끝나면 초기화하고 왼쪽 데이터 셋의 다음 레코드에 대해 같은 처리 반복. @Override public boolean next(Text key, Text value) throws IOException { do { // If we are to go to the next left key/value pair if (go. To. Next. Left) { // Read the next key value pair, false means no more pairs if (!left. RR. next(lkey, lvalue)) { // If no more, then this task is nearly finished alldone = true; break; } else { // If we aren't done, set the value to the key and set our flags key. set(lvalue. to. String()); go. To. Next. Left = alldone = false; // Reset the right record reader this. right. RR = this. right. FIF. get. Record. Reader( this. right. IS, this. right. Conf, this. right. Reporter); } } (Continuing) 2015 -06 -04 52 / 56

Cartesian Product Example • Record reader (Continued) // Read the next key value pair

Cartesian Product Example • Record reader (Continued) // Read the next key value pair from the right data set if (right. RR. next(rkey, rvalue)) {// If success, set the value value. set(rvalue. to. String()); } else { // Otherwise, this right data set is complete // and we should go to the next left pair go. To. Next. Left = true; } // This loop will continue if we finished reading key/value // pairs from the right data set } while (go. To. Next. Left); // Return true if a key/value pair was read, false otherwise return !alldone; } 2015 -06 -04 53 / 56

Cartesian Product Example • Driver code public static void main(String[] args) throws IOException, Interrupted.

Cartesian Product Example • Driver code public static void main(String[] args) throws IOException, Interrupted. Exception, Class. Not. Found. Exception { long start = System. current. Time. Millis(); Job. Conf conf = new Job. Conf("Cartesian Product"); String[] other. Args = new Generic. Options. Parser(conf, args). get. Remaining. Args(); if (other. Args. length != 2) { System. err. println("Usage: Cartesian. Product <comment data> <out>"); System. exit(1); } // Configure the join type conf. set. Jar. By. Class(Cartesian. Product. class); conf. set. Mapper. Class(Cartesian. Mapper. class); conf. set. Num. Reduce. Tasks(0); conf. set. Input. Format(Cartesian. Input. Format. class); Cartesian. Input. Format. set. Left. Input. Info(conf, Text. Input. Format. class, other. Args[0]); Cartesian. Input. Format. set. Right. Input. Info(conf, Text. Input. Format. class, other. Args[0]); Text. Output. Format. set. Output. Path(conf, new Path(other. Args[1])); conf. set. Output. Key. Class(Text. class); conf. set. Output. Value. Class(Text. class); Running. Job job = Job. Client. run. Job(conf); while (!job. is. Complete()) { Thread. sleep(1000); } long finish = System. current. Time. Millis(); System. out. println("Time in ms: " + (finish - start)); System. exit(job. is. Successful() ? 0 : 2); } 2015 -06 -04 54 / 56

Cartesian Product Example • Mapper code public static class Cartesian. Mapper extends Map. Reduce.

Cartesian Product Example • Mapper code public static class Cartesian. Mapper extends Map. Reduce. Base implements Mapper<Text, Text> { private Text outkey = new Text(); @Override public void map(Text key, Text value, Output. Collector<Text, Text> output, Reporter reporter) throws IOException { // If the two comments are not equal if (!key. to. String(). equals(value. to. String())) { String[] left. Tokens = key. to. String(). split("\s"); String[] right. Tokens = value. to. String(). split("\s"); Hash. Set<String> left. Set = new Hash. Set<String>(Arrays. as. List(left. Tokens)); Hash. Set<String> right. Set = new Hash. Set<String>(Arrays. as. List(right. Tokens)); int same. Word. Count = 0; String. Builder words = new String. Builder(); for (String s : left. Set) { if (right. Set. contains(s)) { words. append(s + ", "); ++same. Word. Count; } if (same. Word. Count > 2) { outkey. set(words + "t" + key); output. collect(outkey, value); } } } 2015 -06 -04 55 / 56

END 2015 -06 -04

END 2015 -06 -04