1. Write a MapReduce program in Java to find the number of records in a given set of text files.
// Record count mapper
package rc.hadoop.it.bec;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// Emits the constant key "records" with a count of 1 for every input record
public class RecordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
context.write(new Text("records"), new IntWritable(1));
}
}
// Record count reducer
package rc.hadoop.it.bec;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// Sums the per-record 1s emitted by the mapper and writes only the total count
public class RecordCountReducer extends Reducer<Text, IntWritable, NullWritable, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable v : values) sum += v.get();
context.write(NullWritable.get(), new IntWritable(sum));
}
}
// Record count driver
package rc.hadoop.it.bec;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class RecordCountJob extends Configured implements Tool {
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new RecordCountJob(), args);
System.exit(exitCode);
}
@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(getConf());
job.setJobName("Record Count");
job.setJarByClass(this.getClass());
job.setMapperClass(RecordCountMapper.class);
job.setReducerClass(RecordCountReducer.class);
// A single reducer produces one global count
job.setNumReduceTasks(1);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
return job.waitForCompletion(true) ? 0 : 1;
}
}
2. Write a MapReduce program in Java to find the number of cards of each type in a given deck of playing cards.
// Card count mapper
package cc.hadoop.training.it.bec;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// Emits (card type, 1); the card type (suit) is assumed to be the first
// whitespace-separated field of each input line
public class CardCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
context.write(new Text(value.toString().trim().split("\\s+")[0]), new LongWritable(1));
}
}
// Card count reducer
package cc.hadoop.training.it.bec;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// Sums the counts for each card type (equivalent to the built-in
// LongSumReducer that the driver below actually uses)
public class CardCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
long sum = 0;
for (LongWritable v : values) sum += v.get();
context.write(key, new LongWritable(sum));
}
}
// Card count driver
package cc.hadoop.training.it.bec;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class CardCountJob extends Configured implements Tool {
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new CardCountJob(), args);
System.exit(exitCode);
}
@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(getConf());
job.setJobName("Card Count");
job.setJarByClass(this.getClass());
job.setMapperClass(CardCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
// LongSumReducer sums LongWritable values, so it serves as both
// the combiner and the reducer here
job.setCombinerClass(LongSumReducer.class);
job.setReducerClass(LongSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
return job.waitForCompletion(true) ? 0 : 1;
}
}
3. Write a MapReduce program in Java to find the word count in a given set of text files.
// Word count mapper
package wc.hadoop.training.it.bec;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// Splits each line into words and emits (word, 1)
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private static final IntWritable ONE = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
for (String word : value.toString().split("\\s+")) {
if (!word.isEmpty()) context.write(new Text(word), ONE);
}
}
}
// Word count reducer
package wc.hadoop.training.it.bec;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// Sums the 1s emitted by the mapper for each word
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable v : values) sum += v.get();
context.write(key, new IntWritable(sum));
}
}
// Word count driver
package wc.hadoop.training.it.bec;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class WordCountJob extends Configured implements Tool {
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new WordCountJob(), args);
System.exit(exitCode);
}
@Override
public int run(String[] args) throws Exception {
Job jb1 = Job.getInstance(getConf());
jb1.setJobName("Word Count");
jb1.setJarByClass(WordCountJob.class);
jb1.setMapperClass(WordCountMapper.class);
jb1.setReducerClass(WordCountReducer.class);
jb1.setMapOutputKeyClass(Text.class);
jb1.setMapOutputValueClass(IntWritable.class);
jb1.setOutputKeyClass(Text.class);
jb1.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(jb1, new Path(args[0]));
FileOutputFormat.setOutputPath(jb1, new Path(args[1]));
// The source fragment went on to create a second job (jb2) whose
// output key is IntWritable and value is Text, i.e. a job that sorts
// the words by their counts; the complete chained version is the
// WCMR program below, so this driver runs the word count job alone.
return jb1.waitForCompletion(true) ? 0 : 1;
}
}
4. Write a MapReduce program in Java to find the word count in a given set of text files and sort the result by count.
// Word count mapper and reducer as nested classes of a single holder class
package wcmr.hadoop.training.it.bec;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
public class WCMR {
public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer st = new StringTokenizer(value.toString());
while (st.hasMoreTokens()) context.write(new Text(st.nextToken()), new IntWritable(1));
}
}
public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable v : values) sum += v.get();
context.write(key, new IntWritable(sum));
}
}
}
// WCMR driver: chains the word count job with a second job that sorts by count
package wcmr.hadoop.training.it.bec;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class WCMRDriver extends Configured implements Tool {
public static void main(String[] args) throws Exception {
System.exit(ToolRunner.run(new WCMRDriver(), args));
}
@Override
public int run(String[] args) throws Exception {
Job jb1 = Job.getInstance(getConf());
jb1.setJobName("Word Count");
jb1.setJarByClass(WCMR.class);
jb1.setMapperClass(WCMR.WordCountMapper.class);
jb1.setReducerClass(WCMR.WordCountReducer.class);
jb1.setMapOutputKeyClass(Text.class);
jb1.setMapOutputValueClass(IntWritable.class);
jb1.setOutputFormatClass(TextOutputFormat.class);
jb1.setOutputKeyClass(Text.class);
jb1.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(jb1, new Path(args[0]));
FileOutputFormat.setOutputPath(jb1, new Path(args[1]));
// Second job: sorts the (word, count) output of the first job by
// count, so the count becomes the key and the word the value.
Job jb2 = Job.getInstance(getConf());
jb2.setJobName("Sort By Count");
jb2.setJarByClass(WCMR.class);
// NOTE: the source fragment does not show the mapper for jb2; a
// mapper that parses each "word<TAB>count" line and emits
// (count, word) has to be set here for the second job to run.
jb2.setOutputKeyClass(IntWritable.class);
jb2.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(jb2, new Path(args[1]));
FileOutputFormat.setOutputPath(jb2, new Path(args[2]));
if (jb1.waitForCompletion(true)) {
return (jb2.waitForCompletion(true) ? 0 : 1);
}
return 1;
}
}
5. Write a Pig Latin script to find the top K most frequent words in a given set of text files. (Note: add descriptions to the code wherever required)
-- Load input from the folder named samip, and call the single
-- field in the record 'rline'.
RBL = LOAD '/usr/nsrp/samip/' USING TextLoader() AS (rline:chararray);
-- Split each line into words, one word per record.
BW = FOREACH RBL GENERATE FLATTEN(TOKENIZE(rline)) AS word;
-- Group the occurrences of each word together.
WG = GROUP BW BY word;
-- Count them.
WC = FOREACH WG GENERATE group, COUNT(BW);
-- Sort by count (descending), breaking ties alphabetically, and keep
-- the top K words (K = 5 here).
SWC = ORDER WC BY $1 DESC, $0 ASC;
TKFW = LIMIT SWC 5;
DUMP TKFW;
package util.pig.it.bec;
import org.apache.pig.PrimitiveEvalFunc;
// Eval UDF that trims leading and trailing whitespace from a chararray
// (the class name Trim is assumed; only the exec body appears in the source)
public class Trim extends PrimitiveEvalFunc<String, String> {
@Override
public String exec(String input) {
return input.trim();
}
}
Assume the name of the JAR file created from the above program is mypigutil.jar and
it is located in the directory whose path is /home/udf/pig. The JAR file has to be
registered with the Pig framework by using the following command.
REGISTER /home/udf/pig/mypigutil.jar;
If A is a relation with schema (fruit : chararray) and contains the following tuples:
( pomegranate)
(banana )
(apple)
( lychee )
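then applying the UDF to every tuple trims the values; a minimal sketch, assuming the class shown above is named Trim in package util.pig.it.bec:
B = FOREACH A GENERATE util.pig.it.bec.Trim(fruit);
DUMP B;
The trimmed tuples are: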
(pomegranate)
(banana)
(apple)
(lychee)
8. Write a program in Pig Latin using a macro to find the register numbers of students who failed in a given subject.
-- Macro: returns the register numbers of students who failed the subject
-- whose external and internal marks columns are passed in. The pass
-- criterion (a total of at least 40 with at least 28 in the external
-- examination) is an assumption.
DEFINE filter_by_subject(R, ext, itn) RETURNS F {
FAILED = FILTER $R BY ($ext + $itn) < 40 OR $ext < 28;
$F = FOREACH FAILED GENERATE regn;
};
A = LOAD 'Y15IIISEM.txt' AS
(rn:int,regn:chararray,name:chararray,s1e:int,s1i:int,s2e:int,s2i:int,s3e:int,s3i:int,
s4e:int,s4i:int,s5e:int,s5i:int,s6e:int,s6i:int,s7e:int,s7i:int,s8e:int,s8i:int,s9e:int,s9i:int);
-- Students who failed subject 1 and subject 2 respectively.
S1F = filter_by_subject(A, s1e, s1i);
S2F = filter_by_subject(A, s2e, s2i);
DUMP S1F;
DUMP S2F;
9. Write a program in Pig Latin using a user-defined function (UDF) to find the register numbers of students who failed in a given subject.
package util.pig.it.bec;
import java.io.IOException;
import org.apache.pig.FilterFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
// Filter UDF (class name IsFailed assumed): returns true when the external
// and internal marks passed in represent a failed subject. The pass
// criterion (a total of at least 40 with at least 28 in the external
// examination) is an assumption, matching the macro version above.
public class IsFailed extends FilterFunc {
@Override
public Boolean exec(Tuple st) throws IOException {
if (st == null || st.size() == 0) {
return false;
}
try {
Object extm = st.get(0);
Object intm = st.get(1);
if (extm == null || intm == null) {
return false;
}
int ext = (Integer) extm;
int itn = (Integer) intm;
return (ext + itn) < 40 || ext < 28;
} catch (ExecException e) {
throw new IOException(e);
}
}
}
Assume the name of the JAR file created from the above program is mypigutil.jar and
it is located in the directory whose path is /home/udf/pig. The JAR file has to be
registered with the Pig framework by using the following command.
REGISTER /home/udf/pig/mypigutil.jar;
A = LOAD 'Y15IIISEM.txt' AS
(rn:int,regn:chararray,name:chararray,s1e:int,s1i:int,s2e:int,s2i:int,s3e:int,s3i:int,
s4e:int,s4i:int,s5e:int,s5i:int,s6e:int,s6i:int,s7e:int,s7i:int,s8e:int,s8i:int,s9e:int,s9i:int);
Consider two small demonstration tables, sales (which lists the names of people
and the IDs of the items they bought) and things (which lists the item IDs and
their names):
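A sketch of how the two tables might be declared (the column names, and id as the join key, are assumptions used in the queries below):
CREATE TABLE sales (name STRING, id INT);
CREATE TABLE things (id INT, name STRING);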
OUTER JOINS : Outer joins help to find mismatches between the tuples of
the tables being joined. If the join type is LEFT OUTER JOIN, the query
will return a row for every row in the left table (sales), even if there is
no corresponding row in the table it is being joined to (things):
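A sketch of such a query:
SELECT sales.*, things.* FROM sales LEFT OUTER JOIN things ON (sales.id = things.id);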
If the join type is RIGHT OUTER JOIN, the query will return a row for every
row in the right table (things), even if there is no corresponding row in
the table it is being joined to (sales):
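The corresponding sketch:
SELECT sales.*, things.* FROM sales RIGHT OUTER JOIN things ON (sales.id = things.id);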
Hive also supports full outer joins, where a tuple from each table is placed
in the output whether or not there is a matching tuple in the other table:
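A sketch:
SELECT sales.*, things.* FROM sales FULL OUTER JOIN things ON (sales.id = things.id);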
SEMI JOINS : This join operation is used to filter the tuples of one table
based on a condition defined on the other table. For example, to find all
the items in the things table that are in the sales table, a semi join can
be performed between the tables as follows:
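In HiveQL this is written as a LEFT SEMI JOIN; note that the table on the right of the join (sales) may only be referenced in the ON clause:
SELECT * FROM things LEFT SEMI JOIN sales ON (sales.id = things.id);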
// Custom Pig load function that loads fields given by fixed column ranges,
// in the style of the Unix cut command (only fragments appear in the
// source; the class follows the standard LoadFunc pattern)
package util.pig.it.bec;
import java.io.IOException;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
public class CutLoadFunc extends LoadFunc {
private static final Log LOG = LogFactory.getLog(CutLoadFunc.class);
private final List<Range> ranges;
private final TupleFactory tupleFactory = TupleFactory.getInstance();
private RecordReader reader;
public CutLoadFunc(String cutPattern) {
ranges = Range.parse(cutPattern);
}
@Override
public void setLocation(String location, Job job) throws IOException {
FileInputFormat.setInputPaths(job, location);
}
@Override
public InputFormat getInputFormat() {
return new TextInputFormat();
}
@Override
public void prepareToRead(RecordReader reader, PigSplit split) {
this.reader = reader;
}
@Override
public Tuple getNext() throws IOException {
try {
if (!reader.nextKeyValue()) {
return null;
}
Text value = (Text) reader.getCurrentValue();
String line = value.toString();
Tuple tuple = tupleFactory.newTuple(ranges.size());
// Cut each configured column range out of the line
for (int i = 0; i < ranges.size(); i++) {
Range range = ranges.get(i);
if (range.getEnd() > line.length()) {
LOG.warn(String.format(
"Range end (%s) is longer than line length (%s)",
range.getEnd(), line.length()));
continue;
}
tuple.set(i, new DataByteArray(range.getSubstring(line)));
}
return tuple;
} catch (InterruptedException e) {
throw new ExecException(e);
}
}
}
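The load function is then used in a LOAD statement; a minimal sketch with a hypothetical input file and column ranges:
REGISTER /home/udf/pig/mypigutil.jar;
records = LOAD 'sample.txt' USING util.pig.it.bec.CutLoadFunc('1-9,10-14') AS (f1:chararray, f2:chararray);
DUMP records;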
package util.pig.it.bec;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pig.FilterFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
// Filter UDF (class name IsGoodQuality assumed): returns true when the
// single integer field of the input tuple is one of the codes 0, 1, 4, 5, 9
public class IsGoodQuality extends FilterFunc {
@Override
public Boolean exec(Tuple tuple) throws IOException {
if (tuple == null || tuple.size() == 0) {
return false;
}
try {
Object object = tuple.get(0);
if (object == null) {
return false;
}
int i = (Integer) object;
return i == 0 || i == 1 || i == 4 || i == 5 || i == 9;
} catch (ExecException e) {
throw new IOException(e);
}
}
// Tells Pig to cast the argument to an integer before calling exec
@Override
public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
List<FuncSpec> funcSpecs = new ArrayList<FuncSpec>();
funcSpecs.add(new FuncSpec(this.getClass().getName(),
new Schema(new Schema.FieldSchema(null, DataType.INTEGER))));
return funcSpecs;
}
}
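A sketch of the filter UDF in use (the relation and field names are illustrative):
records = LOAD 'input.txt' AS (year:chararray, temperature:int, quality:int);
good_records = FILTER records BY util.pig.it.bec.IsGoodQuality(quality);
DUMP good_records;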
// Helper class used by CutLoadFunc: a 1-based, inclusive column range
package util.pig.it.bec;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class Range {
private final int start;
private final int end;
public Range(int start, int end) {
this.start = start;
this.end = end;
}
public int getStart() { return start; }
public int getEnd() { return end; }
public String getSubstring(String line) { return line.substring(start - 1, end); }
@Override
public int hashCode() { return start * 37 + end; }
@Override
public boolean equals(Object obj) {
if (!(obj instanceof Range)) {
return false;
}
Range other = (Range) obj;
return this.start == other.start && this.end == other.end;
}
// Parses a comma-separated list of column ranges such as "1-9,10-14"
public static List<Range> parse(String rangeSpec) {
if (rangeSpec.length() == 0) {
return Collections.emptyList();
}
List<Range> ranges = new ArrayList<Range>();
for (String spec : rangeSpec.split(",")) {
String[] split = spec.split("-");
try {
ranges.add(new Range(Integer.parseInt(split[0]),
Integer.parseInt(split[1])));
} catch (NumberFormatException e) {
throw new IllegalArgumentException("Invalid range: " + spec);
}
}
return ranges;
}
}
// Word count in Spark (Scala)
package edu.bec.it.abd.se
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
object wordcount {
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      println("Missing arguments")
      sys.exit(1)
    }
    val conf = new SparkConf()
      .setAppName("Word Count")
      .setMaster("local[*]")
    val sc = new SparkContext(conf)
    // Split lines into words, pair each word with 1, and sum per word
    val counts = sc.textFile(args(0))
      .flatMap(_.split("\\s+"))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
    counts.saveAsTextFile(args(1))
    sc.stop()
  }
}
// Hive UDF (class name Strip assumed) that trims whitespace from a string
package edu.bec.it.hudf;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
public class Strip extends UDF {
public Text evaluate(Text str) {
if (str == null) return null;
return new Text(StringUtils.strip(str.toString()));
}
}
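Once compiled into a JAR, the UDF can be registered and called from Hive; a minimal sketch with a placeholder JAR path:
ADD JAR /home/udf/hive/myhiveudf.jar;
CREATE TEMPORARY FUNCTION strip AS 'edu.bec.it.hudf.Strip';
SELECT strip(name) FROM things;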
Write a HiveQL script to find the top K most frequent words in a given set of text files. (Note: add descriptions to the code wherever required)
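A minimal sketch, assuming the text files are in the HDFS directory /user/hive/docs and K = 5; the table name and path are placeholders:
-- Expose the raw text files as a single-column table.
CREATE EXTERNAL TABLE docs (line STRING) LOCATION '/user/hive/docs';
-- Split every line into words, count each word, and keep the top 5.
SELECT word, count(1) AS cnt
FROM (SELECT explode(split(line, '\\s+')) AS word FROM docs) w
GROUP BY word
ORDER BY cnt DESC
LIMIT 5;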
Describe the complex data types array, map and struct in Hive with an example script.
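A minimal sketch showing all three types in one table (the column names and query are illustrative):
CREATE TABLE complex_demo (
  subjects ARRAY<STRING>,
  marks MAP<STRING, INT>,
  address STRUCT<street:STRING, city:STRING, pin:INT>
);
-- Array elements are indexed from 0, map values are looked up by key,
-- and struct fields are accessed with dot notation:
SELECT subjects[0], marks['hadoop'], address.city FROM complex_demo;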