
1. Write a mapreduce program in Java to find the number of records in the input.

// Record count mapper

package rc.hadoop.it.bec;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class RowCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {


public void map(LongWritable recaddress, Text record, Context con) throws
IOException, InterruptedException {
con.write(new Text("count"), new IntWritable(1));
}

}
// Record count reducer

package rc.hadoop.it.bec;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class RowCountReducer extends Reducer<Text, IntWritable, NullWritable, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context con) throws
IOException, InterruptedException {
int sum=0;
for(IntWritable val : values) {
sum = sum + val.get();
}

con.write(NullWritable.get(), new IntWritable(sum));

}
}
// Record count driver

package rc.hadoop.it.bec;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class RowCountJob extends Configured implements Tool{

public static void main(String[] args1) throws Exception {

int exitCode = ToolRunner.run(new RowCountJob(), args1);

System.exit(exitCode);
}

@Override
public int run(String[] args2) throws Exception {
Job job = Job.getInstance(getConf());
job.setJobName("Record Count");

job.setJarByClass(this.getClass());

FileInputFormat.setInputPaths(job, new Path(args2[0]));


job.setMapperClass(RowCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(RowCountReducer.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(IntWritable.class);
FileOutputFormat.setOutputPath(job, new Path(args2[1]));
return job.waitForCompletion(true) ? 0 : 1;

}
}
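Assuming the three classes above are packaged into a JAR named rowcount.jar (the JAR name is illustrative), the job can be submitted as:

hadoop jar rowcount.jar rc.hadoop.it.bec.RowCountJob <input path> <output path>

The output directory must not exist before the job is run.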
2. Write a mapreduce program in Java to find the number of cards of each type in a given deck of playing cards.

// Card count mapper

package cc.hadoop.training.it.bec;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CardCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {


public void map(LongWritable recadd, Text rec, Context con) throws IOException,
InterruptedException {
String line = rec.toString();
String[] tokens = line.split("\\|");
con.write(new Text(tokens[0]), new LongWritable(1));
}

}
// Card count reducer

package cc.hadoop.training.it.bec;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class CardCountReducer extends Reducer<Text, LongWritable, Text, LongWritable>{


public void reduce(Text key, Iterable<LongWritable> values, Context con) throws
IOException, InterruptedException {
long sum=0;
for(LongWritable value : values) {
sum = sum + value.get();
}

con.write(key, new LongWritable(sum));

}
}
// Card count driver

package cc.hadoop.training.it.bec;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class CardCountJob extends Configured implements Tool {


public static void main(String[] args) throws Exception {

int exitCode = ToolRunner.run(new CardCountJob(), args);

System.exit(exitCode);
}

@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(getConf());
job.setJobName("Card Count");

job.setJarByClass(this.getClass());

FileInputFormat.setInputPaths(job, new Path(args[0]));

job.setMapperClass(CardCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);

job.setCombinerClass(LongSumReducer.class);

job.setReducerClass(LongSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
return job.waitForCompletion(true) ? 0 : 1;
}

}
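The mapper assumes a '|'-delimited input record with the card type as its first field, for example a line such as SPADE|5 (the exact layout of the deck file is an assumption). Packaged into, say, cardcount.jar, the job is run as:

hadoop jar cardcount.jar cc.hadoop.training.it.bec.CardCountJob <input path> <output path>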
3. Write a mapreduce program in Java to find the word count in a given set of text files.

// Word count mapper

package wc.hadoop.training.it.bec;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{


private String tokens = "[_|$#<>\\^=\\[\\]\\*/\\\\,;,.\\-:()?!\"']";
public void map(LongWritable recadd, Text rec, Context con) throws IOException,
InterruptedException {
String cleanline = rec.toString().toLowerCase().replaceAll(tokens, " ");
String[] words = cleanline.split("\\s+");
for(String str : words) {
// skip empty tokens produced by leading separators
if(!str.isEmpty()) {
con.write(new Text(str), new IntWritable(1));
}
}

}
}
// Word count reducer

package wc.hadoop.training.it.bec;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

public void reduce(Text key, Iterable<IntWritable> values, Context con)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable el : values) {
sum = sum + el.get();
}
con.write(key, new IntWritable(sum));
}

}
// Word count driver

package wc.hadoop.training.it.bec;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountJob extends Configured implements Tool {


public static void main(String[] cla) throws Exception {
int exitstatus = ToolRunner.run(new WordCountJob(), cla);
System.exit(exitstatus);
}

@Override
public int run(String[] args) throws Exception {

Job jb1 = Job.getInstance(getConf());

jb1.setJobName("Word Count");
jb1.setJarByClass(WordCountJob.class);
jb1.setMapperClass(WordCountMapper.class);
jb1.setReducerClass(WordCountReducer.class);
jb1.setMapOutputKeyClass(Text.class);
jb1.setMapOutputValueClass(IntWritable.class);

jb1.setOutputKeyClass(Text.class);
jb1.setOutputValueClass(IntWritable.class);

FileInputFormat.setInputPaths(jb1, new Path(args[0]));


FileOutputFormat.setOutputPath(jb1, new Path(args[1]));

if(jb1.waitForCompletion(true)) {
Job jb2 = Job.getInstance(getConf());

jb2.setJobName("Word Count Sorter");


jb2.setJarByClass(WordCountJob.class);
jb2.setMapperClass(WCMTwo.class);
jb2.setReducerClass(WCRTwo.class);
jb2.setMapOutputKeyClass(IntWritable.class);
jb2.setMapOutputValueClass(Text.class);

jb2.setOutputKeyClass(IntWritable.class);
jb2.setOutputValueClass(Text.class);

FileInputFormat.setInputPaths(jb2, new Path(args[1]));
FileOutputFormat.setOutputPath(jb2, new Path(args[2]));
if(jb2.waitForCompletion(true)) {
return 0;
}
else return 1;
}
else return 1;
}
}
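The sorter classes WCMTwo and WCRTwo used by the second job are not listed above. A minimal sketch of what they could look like, assuming they follow the same swap-and-emit pattern as the WSMR classes in the next answer (each class would go in its own file):

package wc.hadoop.training.it.bec;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical sorter mapper: turns each "word<TAB>count" line into (count, word)
public class WCMTwo extends Mapper<LongWritable, Text, IntWritable, Text> {
public void map(LongWritable recadd, Text rec, Context con) throws IOException, InterruptedException {
String[] parts = rec.toString().trim().split("\t");
con.write(new IntWritable(Integer.parseInt(parts[1])), new Text(parts[0]));
}
}

package wc.hadoop.training.it.bec;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical sorter reducer: writes every (count, word) pair, grouped and sorted by count
public class WCRTwo extends Reducer<IntWritable, Text, IntWritable, Text> {
public void reduce(IntWritable key, Iterable<Text> values, Context con) throws IOException, InterruptedException {
for (Text word : values) {
con.write(key, word);
}
}
}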
4. Write a mapreduce program in Java to calculate word frequencies in a given set of text files and arrange them in decreasing order of their frequencies.

package wcmr.hadoop.training.it.bec;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class WCMR {

public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
protected String delimeters = " , .;:'\"&!?-_\n\t12345678910[]{}<>\\`~|=^()@#$%^*/+-";
protected static boolean caseSensitive = false;

public void map(LongWritable recadd, Text rec, Context con)
throws IOException, InterruptedException {
String line = (caseSensitive) ? rec.toString() :
rec.toString().toLowerCase();
StringTokenizer tokenizer = new StringTokenizer(line, delimeters);
while (tokenizer.hasMoreTokens()) {
con.write(new Text(tokenizer.nextToken()), new
IntWritable(1));
}

}
}

public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

public void reduce(Text key, Iterable<IntWritable> values, Context con)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable el : values) {
sum = sum + el.get();
}
con.write(key, new IntWritable(sum));
}

}
}
package wcmr.hadoop.training.it.bec;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class WSMR {


public static class WCSMapper extends
Mapper<LongWritable, Text, IntWritable, Text> {
public void map(LongWritable recadd, Text rec, Context con)
throws IOException, InterruptedException {
String[] strs = rec.toString().trim().split("\t");
con.write(new IntWritable(Integer.parseInt(strs[1])), new
Text(strs[0]));
}
}

public static class WCSReducer extends Reducer<IntWritable, Text, IntWritable, Text> {
public void reduce(IntWritable key, Iterable<Text> values, Context con)
throws IOException, InterruptedException {
for (Text el : values) {
con.write(key, el);
}
}
}
}
package wcmr.hadoop.training.it.bec;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WCS extends Configured implements Tool {


public static void main(String[] cla) throws Exception {
int exitstatus = ToolRunner.run(new WCS(), cla);
System.exit(exitstatus);
}

@Override
public int run(String[] args) throws Exception {
Job jb1 = Job.getInstance(getConf());

jb1.setJobName("Word Count");
jb1.setJarByClass(WCMR.class);
jb1.setMapperClass(WCMR.WordCountMapper.class);
jb1.setReducerClass(WCMR.WordCountReducer.class);
jb1.setMapOutputKeyClass(Text.class);
jb1.setMapOutputValueClass(IntWritable.class);
jb1.setOutputFormatClass(TextOutputFormat.class);
jb1.setOutputKeyClass(Text.class);
jb1.setOutputValueClass(IntWritable.class);

FileInputFormat.setInputPaths(jb1, new Path(args[0]));


FileOutputFormat.setOutputPath(jb1, new Path(args[1]));

Job jb2 = Job.getInstance(getConf());


jb2.setJobName("Word Count Sorter");
jb2.setJarByClass(WSMR.class);
jb2.setMapperClass(WSMR.WCSMapper.class);
jb2.setReducerClass(WSMR.WCSReducer.class);
jb2.setMapOutputKeyClass(IntWritable.class);
jb2.setMapOutputValueClass(Text.class);

jb2.setOutputKeyClass(IntWritable.class);
jb2.setOutputValueClass(Text.class);

FileInputFormat.setInputPaths(jb2, new Path(args[1]));


FileOutputFormat.setOutputPath(jb2, new Path(args[2]));

if (jb1.waitForCompletion(true)) {
return (jb2.waitForCompletion(true) ? 0 : 1);
}
return 1;
}
}
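As written, the second job sorts counts in ascending order (the default for IntWritable keys). One way to obtain decreasing order of frequency, sketched here rather than taken from the original listing, is to register a descending sort comparator on jb2 before the jobs are submitted:

// Hypothetical comparator that reverses the default IntWritable ordering
public static class DescendingIntComparator extends org.apache.hadoop.io.WritableComparator {
public DescendingIntComparator() {
super(org.apache.hadoop.io.IntWritable.class, true);
}
@Override
public int compare(org.apache.hadoop.io.WritableComparable a, org.apache.hadoop.io.WritableComparable b) {
return -super.compare(a, b);
}
}

// In run():
jb2.setSortComparatorClass(DescendingIntComparator.class);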
5. Write a Pig Latin script to find the top K frequent words in a given set of text files. (Note: Add descriptions to the code wherever required.)

-- Load input from the folder named samip, and call the single
-- field in the record 'rline'.
RBL = LOAD '/usr/nsrp/samip/' USING TextLoader() AS (rline:chararray);

BL = FOREACH RBL GENERATE REPLACE(rline,'([^a-zA-Z\\s]+)','') AS line:chararray;

-- TOKENIZE splits the line into a field for each word.


-- flatten will take the collection of records returned by
-- TOKENIZE and produce a separate record for each one, calling the single
-- field in the record word.
BW = FOREACH BL GENERATE FLATTEN(TOKENIZE(line)) AS word;

-- Now group them together by each word.


WG = GROUP BW BY word;

-- DW/DWC compute the total number of distinct words; they are not used for the top-K result.
DW = GROUP WG ALL;
DWC = FOREACH DW GENERATE group, COUNT(WG);

-- Count them.
WC = FOREACH WG GENERATE group, COUNT(BW);
SWC = ORDER WC BY $1 DESC,$0 ASC;
TKFW = LIMIT SWC 5;

-- Print out the results.


DUMP TKFW;
6. Write a Pig Latin script to find maximum temperature in a given year given the
weather data with three fields Year, Temperature and Quality Parameter (QP) per
record as depicted in Table 1. Discard the records with temperature value 9999
or QP value in 2, 3, 6, 7, 8. The range of QP is 0 to 9.

-- max_temp_param.pig

records = LOAD '$input'
AS (Year:chararray, Temperature:int, QP:int);
filtered_records = FILTER records BY Temperature != 9999 AND
QP IN (0, 1, 4, 5, 9);
grouped_records = GROUP filtered_records BY Year;
max_temp = FOREACH grouped_records GENERATE group,
MAX(filtered_records.Temperature);
STORE max_temp INTO '$output';
DUMP max_temp;

% pig -param input=/input/sample.txt -param output=/tmp/out max_temp_param.pig


7. Implement an EvalFunc UDF in Pig Latin to trim leading and trailing whitespace
from char array values and explain the procedure to call the UDF in a Pig Latin
script.

package util.pig.it.bec;

import org.apache.pig.PrimitiveEvalFunc;

public class Trim extends PrimitiveEvalFunc<String, String> {

@Override
public String exec(String input) {
return input.trim();
}
}

Assume the name of the JAR file created from the above program is mypigutil.jar and
it is located in the directory whose path is /home/udf/pig. The JAR file has to be
registered with the Pig framework by using the following command.

REGISTER /home/udf/pig/mypigutil.jar;

If A is a relation with schema (fruit : chararray) and contains the following tuples

( pomegranate)
(banana )
(apple)
( lychee )

The Trim function can be invoked on the tuples in relation A as follows

B = FOREACH A GENERATE util.pig.it.bec.Trim(fruit);

The relation B contains the following tuples.

(pomegranate)
(banana)
(apple)
(lychee)
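A shorter alias can also be assigned to the UDF with DEFINE before it is used (the alias name trim_udf below is arbitrary):

DEFINE trim_udf util.pig.it.bec.Trim();
B = FOREACH A GENERATE trim_udf(fruit);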
8. Write a program in Pig Latin using macro to find the register numbers of
students who failed in a given subject.

A = LOAD 'Y15IIISEM.txt' AS (rn:int, regn:chararray, name:chararray,
s1e:int, s1i:int, s2e:int, s2i:int, s3e:int, s3i:int, s4e:int, s4i:int,
s5e:int, s5i:int, s6e:int, s6i:int, s7e:int, s7i:int, s8e:int, s8i:int,
s9e:int, s9i:int);

--F1S = ORDER A BY s1e DESC, s1i ASC;

DEFINE filter_by_subject(marks, f_key1, f_key2) RETURNS Y {


B = FILTER $marks BY ($f_key1 < 24) OR (($f_key1+$f_key2 < 40));
$Y = FOREACH B GENERATE regn;
};

-- Failures in first subject

S1F = filter_by_subject(A,s1e,s1i);

-- Failures in second subject

S2F = filter_by_subject(A,s2e,s2i);

DUMP S2F;
9. Write a program in Pig Latin using user defined function (UDF) to find the
register numbers of students who failed in a given subject.

// Class which implements user defined function

package util.pig.it.bec;

import java.io.IOException;

import org.apache.pig.FilterFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;

public class GetFR extends FilterFunc {

@Override
public Boolean exec(Tuple st) throws IOException {
if (st == null || st.size() == 0) {
return false;
}
try {
Object extm = st.get(0);
Object intm = st.get(1);

if ((extm == null) || (intm == null)) {
return false;
}
int e = (Integer) extm;
int i = (Integer) intm;
if ((e < 24) || (e + i < 40)) {
return true;
} else {
return false;
}
} catch (ExecException e) {
throw new IOException(e);
}
}
}
Assume the name of the JAR file created from the above program is mypigutil.jar and
it is located in the directory whose path is /home/udf/pig. The JAR file has to be
registered with the Pig framework by using the following command.

-- Pig Latin script to call the user defined function

REGISTER /home/udf/pig/mypigutil.jar;

--Using DEFINE assign a name to your function;

DEFINE myf util.pig.it.bec.GetFR();

A = LOAD 'Y15IIISEM.txt' AS (rn:int, regn:chararray, name:chararray,
s1e:int, s1i:int, s2e:int, s2i:int, s3e:int, s3i:int, s4e:int, s4i:int,
s5e:int, s5i:int, s6e:int, s6i:int, s7e:int, s7i:int, s8e:int, s8i:int,
s9e:int, s9i:int);

S1F = FILTER A BY myf(s1e,s1i);


R = FOREACH S1F GENERATE $1;
DUMP R;
10. Write a HiveQL script to find the top K frequent words in a given set of text files. (Note: Add descriptions to the code wherever required.)

CREATE TABLE book (content STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ' '
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH '/usr/nsrp/samip' OVERWRITE INTO TABLE book;

CREATE TABLE cbook AS
SELECT regexp_replace(trim(lower(content)), "[^A-Za-z0-9\\s]+", "") AS word
FROM book;

SELECT word, COUNT(1) AS wc
FROM cbook
GROUP BY word
ORDER BY wc DESC
LIMIT 5;
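Because the book table is declared with FIELDS TERMINATED BY ' ', each row of content holds only the first token of a line. A sketch of an alternative that counts every word by splitting whole lines with split and explode (the staging table name rawbook is illustrative):

CREATE TABLE rawbook (line STRING);
LOAD DATA LOCAL INPATH '/usr/nsrp/samip' OVERWRITE INTO TABLE rawbook;

SELECT word, COUNT(1) AS wc
FROM (SELECT explode(split(lower(line), '\\s+')) AS word FROM rawbook) w
GROUP BY word
ORDER BY wc DESC
LIMIT 5;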
11. How are joins performed between tables in Hive?

Consider two small demonstration tables, sales (which lists the names of people
and the IDs of the items they bought) and things (which lists the item IDs and
their names):

An inner join on the two tables can be performed as follows:
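A possible query, assuming the columns sales(name, id) and things(id, name):

SELECT sales.*, things.*
FROM sales JOIN things ON (sales.id = things.id);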

OUTER JOINS: Outer joins help to find mismatches between the tuples of the tables being joined. If the join type is LEFT OUTER JOIN, the query will return a row for every row in the left table (sales), even if there is no corresponding row in the table it is being joined to (things):
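A possible query, with the same assumed columns:

SELECT sales.*, things.*
FROM sales LEFT OUTER JOIN things ON (sales.id = things.id);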

If the join type is RIGHT OUTER JOIN, the query will return a row for every
row in the right table (things), even if there is no corresponding row in
the table it is being joined to (sales):
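A possible query, with the same assumed columns:

SELECT sales.*, things.*
FROM sales RIGHT OUTER JOIN things ON (sales.id = things.id);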

Hive also supports full outer joins, where a tuple from each table is placed in the output whether or not there is a matching tuple in the other table:
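A possible query, with the same assumed columns:

SELECT sales.*, things.*
FROM sales FULL OUTER JOIN things ON (sales.id = things.id);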

SEMI JOINS: This join operation is used to filter the tuples of one table based on a condition defined on another table. For example, to find all the items in the things table that appear in the sales table, a semi join can be performed as follows:
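A possible query, with the same assumed columns (Hive supports only LEFT SEMI JOIN, and the right-hand table may appear only in the ON clause):

SELECT * FROM things LEFT SEMI JOIN sales ON (sales.id = things.id);

// Custom Pig LoadFunc (CutLoadFunc) that loads fixed column ranges (cut-style fields) from each input line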
package util.pig.it.bec;

import java.io.IOException;
import java.util.List;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.Log;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class CutLoadFunc extends LoadFunc {

private static final Log LOG = LogFactory.getLog(CutLoadFunc.class);

private final List<Range> ranges;

private final TupleFactory tupleFactory = TupleFactory.getInstance();

private RecordReader reader;

public CutLoadFunc(String cutPattern) {
ranges = Range.parse(cutPattern);
}

@Override
public void setLocation(String location, Job job) throws IOException {
FileInputFormat.setInputPaths(job, location);
}

@Override
public InputFormat getInputFormat() {
return new TextInputFormat();
}

@Override
public void prepareToRead(RecordReader reader, PigSplit split) {
this.reader = reader;
}

@Override
public Tuple getNext() throws IOException {
try {
if (!reader.nextKeyValue()) {
return null;
}
Text value = (Text) reader.getCurrentValue();
String line = value.toString();
Tuple tuple = tupleFactory.newTuple(ranges.size());
for (int i = 0; i < ranges.size(); i++) {
Range range = ranges.get(i);
if (range.getEnd() > line.length()) {
LOG.warn(String.format(
"Range end (%s) is longer than line length (%s)",
range.getEnd(), line.length()));
continue;
}
tuple.set(i, new DataByteArray(range.getSubstring(line)));
}
return tuple;
} catch (InterruptedException e) {
throw new ExecException(e);
}
}
}
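// Pig FilterFunc UDF (IsGoodQuality) that keeps only records whose quality code is 0, 1, 4, 5 or 9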
package util.pig.it.bec;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pig.FilterFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;

public class IsGoodQuality extends FilterFunc {

@Override
public Boolean exec(Tuple tuple) throws IOException {
if (tuple == null || tuple.size() == 0) {
return false;
}
try {
Object object = tuple.get(0);
if (object == null) {
return false;
}
int i = (Integer) object;
return i == 0 || i == 1 || i == 4 || i == 5 || i == 9;
} catch (ExecException e) {
throw new IOException(e);
}
}

@Override
public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
List<FuncSpec> funcSpecs = new ArrayList<FuncSpec>();
funcSpecs.add(new FuncSpec(this.getClass().getName(),
new Schema(new Schema.FieldSchema(null, DataType.INTEGER))));
return funcSpecs;
}
}
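// Helper class Range, used by CutLoadFunc to represent and parse column ranges such as 2-5,8-10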
package util.pig.it.bec;
import java.util.ArrayList;

import java.util.Collections;

import java.util.List;

public class Range {

private final int start;

private final int end;

public Range(int start, int end) {
this.start = start;
this.end = end;
}

public int getStart() {
return start;
}

public int getEnd() {
return end;
}

public String getSubstring(String line) {
return line.substring(start - 1, end);
}

@Override
public int hashCode() {
return start * 37 + end;
}

@Override
public boolean equals(Object obj) {
if (!(obj instanceof Range)) {
return false;
}
Range other = (Range) obj;
return this.start == other.start && this.end == other.end;
}

public static List<Range> parse(String rangeSpec) throws IllegalArgumentException {
if (rangeSpec.length() == 0) {
return Collections.emptyList();
}
List<Range> ranges = new ArrayList<Range>();
String[] specs = rangeSpec.split(",");
for (String spec : specs) {
String[] split = spec.split("-");
try {
ranges.add(new Range(Integer.parseInt(split[0]),
Integer.parseInt(split[1])));
} catch (NumberFormatException e) {
throw new IllegalArgumentException(e.getMessage());
}
}
return ranges;
}
}
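// Spark word count application (Scala)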
package edu.bec.it.abd.se

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object wordcount {
def main(args: Array[String]): Unit = {

if(args.length < 2) {
println("Missing arguments")
sys.exit(1)
}
val conf = new SparkConf()
.setAppName("Word Count")
.setMaster("local[*]")

val sc = new SparkContext(conf)


val iptext = sc.textFile(args(0)).map(rl => rl.trim()).map(rl => rl.toLowerCase())
val words = iptext.flatMap(line => line.split(" "))
val iwc = words.map(word => (word, 1))
val cwc = iwc.reduceByKey((v1,v2) => v1+v2)
val swc = cwc.sortByKey(true)
swc.foreach(println)
swc.saveAsTextFile(args(1))

}
}
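Assuming the application is packaged into a JAR named wordcount.jar (an illustrative name), it can be run with spark-submit:

spark-submit --class edu.bec.it.abd.se.wordcount wordcount.jar <input path> <output path>

// Hive UDF (Strip) with overloaded evaluate methods for substring extraction and character stripping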
package edu.bec.it.hudf;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class Strip extends UDF {


private Text result = new Text();

public Text evaluate(Text str, int si, int ei) {


if (str == null) {
return null;
}
result.set(StringUtils.substring(str.toString(), si, ei));
return result;
}

public Text evaluate(Text str) {


if (str == null) {
return null;
}
result.set(StringUtils.strip(str.toString()));
return result;
}

public Text evaluate(Text str, String stripChars) {


if (str == null) {
return null;
}
result.set(StringUtils.strip(str.toString(), stripChars));
return result;
}
}
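Assuming the UDF classes are packaged into a JAR named hiveudf.jar (the name and path are illustrative), Strip can be used from Hive as follows:

ADD JAR /path/to/hiveudf.jar;
CREATE TEMPORARY FUNCTION strip AS 'edu.bec.it.hudf.Strip';
SELECT strip(col) FROM some_table;   -- col is any STRING column

// Hive UDAF (Mean) with evaluators that compute the mean of float and int values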
package edu.bec.it.hudf;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;

public class Mean extends UDAF {


public static class MeanFloatUDAFEvaluator implements UDAFEvaluator {
public static class PartialResult {
float sum;
long count;
}

private PartialResult partial;

public void init() {


partial = null;
}

public boolean iterate(FloatWritable value) {


if (value == null) {
return true;
}
if (partial == null) {
partial = new PartialResult();
}
partial.sum += value.get();
partial.count++;
return true;
}

public PartialResult terminatePartial() {


return partial;
}

public boolean merge(PartialResult other) {


if (other == null) {
return true;
}
if (partial == null) {
partial = new PartialResult();
}
partial.sum += other.sum;
partial.count += other.count;
return true;
}

public FloatWritable terminate() {


if (partial == null) {
return null;
}
return new FloatWritable(partial.sum / partial.count);
}
}

public static class MeanIntUDAFEvaluator implements UDAFEvaluator {


public static class PartialResult {
int sum;
long count;
}

private PartialResult partial;


public void init() {
partial = null;
}

public boolean iterate(IntWritable value) {


if (value == null) {
return true;
}
if (partial == null) {
partial = new PartialResult();
}
partial.sum += value.get();
partial.count++;
return true;
}

public PartialResult terminatePartial() {


return partial;
}

public boolean merge(PartialResult other) {


if (other == null) {
return true;
}
if (partial == null) {
partial = new PartialResult();
}
partial.sum += other.sum;
partial.count += other.count;
return true;
}

public FloatWritable terminate() {


if (partial == null) {
return null;
}
return new FloatWritable((float) partial.sum / partial.count);
}
}

}
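A similar registration works for the UDAF (again, the JAR, table, and column names are illustrative):

ADD JAR /path/to/hiveudf.jar;
CREATE TEMPORARY FUNCTION my_mean AS 'edu.bec.it.hudf.Mean';
SELECT my_mean(temperature) FROM weather_records;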
Describe the complex data types array, map, and struct in Hive with an example script.


Write about COGROUP in Hive

How does table partitioning improve Hive query performance?

Describe the Sqoop import command in detail.
