
1. Write a mapreduce program in Java to find the number of records in the input.

// Record count mapper

package rc.hadoop.it.bec;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class RowCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {


public void map(LongWritable recaddress, Text record, Context con) throws
IOException, InterruptedException {
con.write(new Text("count"), new IntWritable(1));
}

}
// Record count reducer

package rc.hadoop.it.bec;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class RowCountReducer extends Reducer<Text, IntWritable, NullWritable, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context con) throws
IOException, InterruptedException {
int sum=0;
for(IntWritable val : values) {
sum = sum + val.get();
}

con.write(NullWritable.get(), new IntWritable(sum));

}
}
// Record count driver

package rc.hadoop.it.bec;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class RowCountJob extends Configured implements Tool{

public static void main(String[] args1) throws Exception {

int exitCode = ToolRunner.run(new RowCountJob(), args1);

System.exit(exitCode);
}

@Override
public int run(String[] args2) throws Exception {
Job job = Job.getInstance(getConf());
job.setJobName("Record Count");

job.setJarByClass(this.getClass());

FileInputFormat.setInputPaths(job, new Path(args2[0]));


job.setMapperClass(RowCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(RowCountReducer.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(IntWritable.class);
FileOutputFormat.setOutputPath(job, new Path(args2[1]));
return job.waitForCompletion(true) ? 0 : 1;

}
}
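Assuming the three classes above are packaged into a JAR named rowcount.jar (the JAR name is illustrative), the job can be submitted as:

hadoop jar rowcount.jar rc.hadoop.it.bec.RowCountJob <input path> <output path>

The output directory must not exist before the job is run.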
2. Write a mapreduce program in Java to find the number of cards of each type in a given deck of playing cards.

// Card count mapper

package cc.hadoop.training.it.bec;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CardCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {


public void map(LongWritable recadd, Text rec, Context con) throws IOException,
InterruptedException {
String line = rec.toString();
String[] tokens = line.split("\\|");
con.write(new Text(tokens[0]), new LongWritable(1));
}

}
// Card count reducer

package cc.hadoop.training.it.bec;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class CardCountReducer extends Reducer<Text, LongWritable, Text, LongWritable>{


public void reduce(Text key, Iterable<LongWritable> values, Context con) throws
IOException, InterruptedException {
long sum=0;
for(LongWritable value : values) {
sum = sum + value.get();
}

con.write(key, new LongWritable(sum));

}
}
// Card count driver

package cc.hadoop.training.it.bec;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class CardCountJob extends Configured implements Tool {


public static void main(String[] args) throws Exception {

int exitCode = ToolRunner.run(new CardCountJob(), args);

System.exit(exitCode);
}

@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(getConf());
job.setJobName("Card Count");

job.setJarByClass(this.getClass());

FileInputFormat.setInputPaths(job, new Path(args[0]));

job.setMapperClass(CardCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);

job.setCombinerClass(LongSumReducer.class);

job.setReducerClass(LongSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
return job.waitForCompletion(true) ? 0 : 1;
}

}
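The mapper assumes a '|'-delimited input record with the card type as its first field, for example a line such as SPADE|5 (the exact layout of the deck file is an assumption). Packaged into, say, cardcount.jar, the job is run as:

hadoop jar cardcount.jar cc.hadoop.training.it.bec.CardCountJob <input path> <output path>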
3. Write a mapreduce program in Java to find the word count in a given set of text files.

// Word count mapper

package wc.hadoop.training.it.bec;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{


private String tokens = "[_|$#<>\\^=\\[\\]\\*/\\\\,;,.\\-:()?!\"']";
public void map(LongWritable recadd, Text rec, Context con) throws IOException,
InterruptedException {
String cleanline = rec.toString().toLowerCase().replaceAll(tokens, " ");
String[] words = cleanline.split("\\s+");
for(String str : words) {
// skip empty tokens produced by leading separators
if(!str.isEmpty()) {
con.write(new Text(str), new IntWritable(1));
}
}

}
}
// Word count reducer

package wc.hadoop.training.it.bec;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

public void reduce(Text key, Iterable<IntWritable> values, Context con)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable el : values) {
sum = sum + el.get();
}
con.write(key, new IntWritable(sum));
}

}
// Word count driver

package wc.hadoop.training.it.bec;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountJob extends Configured implements Tool {


public static void main(String[] cla) throws Exception {
int exitstatus = ToolRunner.run(new WordCountJob(), cla);
System.exit(exitstatus);
}

@Override
public int run(String[] args) throws Exception {

Job jb1 = Job.getInstance(getConf());

jb1.setJobName("Word Count");
jb1.setJarByClass(WordCountJob.class);
jb1.setMapperClass(WordCountMapper.class);
jb1.setReducerClass(WordCountReducer.class);
jb1.setMapOutputKeyClass(Text.class);
jb1.setMapOutputValueClass(IntWritable.class);

jb1.setOutputKeyClass(Text.class);
jb1.setOutputValueClass(IntWritable.class);

FileInputFormat.setInputPaths(jb1, new Path(args[0]));


FileOutputFormat.setOutputPath(jb1, new Path(args[1]));

if(jb1.waitForCompletion(true)) {
Job jb2 = Job.getInstance(getConf());

jb2.setJobName("Word Count Sorter");


jb2.setJarByClass(WordCountJob.class);
jb2.setMapperClass(WCMTwo.class);
jb2.setReducerClass(WCRTwo.class);
jb2.setMapOutputKeyClass(IntWritable.class);
jb2.setMapOutputValueClass(Text.class);

jb2.setOutputKeyClass(IntWritable.class);
jb2.setOutputValueClass(Text.class);

FileInputFormat.setInputPaths(jb2, new Path(args[1]));
FileOutputFormat.setOutputPath(jb2, new Path(args[2]));
if(jb2.waitForCompletion(true)) {
return 0;
}
else return 1;
}
else return 1;
}
}
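The sorter classes WCMTwo and WCRTwo used by the second job are not listed above. A minimal sketch of what they could look like, assuming they follow the same swap-and-emit pattern as the WSMR classes in the next answer (each class would go in its own file):

package wc.hadoop.training.it.bec;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical sorter mapper: turns each "word<TAB>count" line into (count, word)
public class WCMTwo extends Mapper<LongWritable, Text, IntWritable, Text> {
public void map(LongWritable recadd, Text rec, Context con) throws IOException, InterruptedException {
String[] parts = rec.toString().trim().split("\t");
con.write(new IntWritable(Integer.parseInt(parts[1])), new Text(parts[0]));
}
}

package wc.hadoop.training.it.bec;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical sorter reducer: writes every (count, word) pair, grouped and sorted by count
public class WCRTwo extends Reducer<IntWritable, Text, IntWritable, Text> {
public void reduce(IntWritable key, Iterable<Text> values, Context con) throws IOException, InterruptedException {
for (Text word : values) {
con.write(key, word);
}
}
}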
4. Write a mapreduce program in Java to calculate word frequencies in a given set of text files and arrange them in decreasing order of their frequencies.

package wcmr.hadoop.training.it.bec;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class WCMR {

public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
protected String delimeters = " , .;:'\"&!?-_\n\t12345678910[]{}<>\\`~|=^()@#$%^*/+-";
protected static boolean caseSensitive = false;

public void map(LongWritable recadd, Text rec, Context con)
throws IOException, InterruptedException {
String line = (caseSensitive) ? rec.toString() :
rec.toString().toLowerCase();
StringTokenizer tokenizer = new StringTokenizer(line, delimeters);
while (tokenizer.hasMoreTokens()) {
con.write(new Text(tokenizer.nextToken()), new
IntWritable(1));
}

}
}

public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

public void reduce(Text key, Iterable<IntWritable> values, Context con)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable el : values) {
sum = sum + el.get();
}
con.write(key, new IntWritable(sum));
}

}
}
package wcmr.hadoop.training.it.bec;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class WSMR {


public static class WCSMapper extends
Mapper<LongWritable, Text, IntWritable, Text> {
public void map(LongWritable recadd, Text rec, Context con)
throws IOException, InterruptedException {
String[] strs = rec.toString().trim().split("\t");
con.write(new IntWritable(Integer.parseInt(strs[1])), new
Text(strs[0]));
}
}

public static class WCSReducer extends Reducer<IntWritable, Text, IntWritable, Text> {
public void reduce(IntWritable key, Iterable<Text> values, Context con)
throws IOException, InterruptedException {
for (Text el : values) {
con.write(key, el);
}
}
}
}
package wcmr.hadoop.training.it.bec;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WCS extends Configured implements Tool {


public static void main(String[] cla) throws Exception {
int exitstatus = ToolRunner.run(new WCS(), cla);
System.exit(exitstatus);
}

@Override
public int run(String[] args) throws Exception {
Job jb1 = Job.getInstance(getConf());

jb1.setJobName("Word Count");
jb1.setJarByClass(WCMR.class);
jb1.setMapperClass(WCMR.WordCountMapper.class);
jb1.setReducerClass(WCMR.WordCountReducer.class);
jb1.setMapOutputKeyClass(Text.class);
jb1.setMapOutputValueClass(IntWritable.class);
jb1.setOutputFormatClass(TextOutputFormat.class);
jb1.setOutputKeyClass(Text.class);
jb1.setOutputValueClass(IntWritable.class);

FileInputFormat.setInputPaths(jb1, new Path(args[0]));


FileOutputFormat.setOutputPath(jb1, new Path(args[1]));

Job jb2 = Job.getInstance(getConf());


jb2.setJobName("Word Count Sorter");
jb2.setJarByClass(WSMR.class);
jb2.setMapperClass(WSMR.WCSMapper.class);
jb2.setReducerClass(WSMR.WCSReducer.class);
jb2.setMapOutputKeyClass(IntWritable.class);
jb2.setMapOutputValueClass(Text.class);

jb2.setOutputKeyClass(IntWritable.class);
jb2.setOutputValueClass(Text.class);

FileInputFormat.setInputPaths(jb2, new Path(args[1]));


FileOutputFormat.setOutputPath(jb2, new Path(args[2]));

if (jb1.waitForCompletion(true)) {
return (jb2.waitForCompletion(true) ? 0 : 1);
}
return 1;
}
}
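As written, the second job sorts counts in ascending order (the default for IntWritable keys). One way to obtain decreasing order of frequency, sketched here rather than taken from the original listing, is to register a descending sort comparator on jb2 before the jobs are submitted:

// Hypothetical comparator that reverses the default IntWritable ordering
public static class DescendingIntComparator extends org.apache.hadoop.io.WritableComparator {
public DescendingIntComparator() {
super(org.apache.hadoop.io.IntWritable.class, true);
}
@Override
public int compare(org.apache.hadoop.io.WritableComparable a, org.apache.hadoop.io.WritableComparable b) {
return -super.compare(a, b);
}
}

// In run():
jb2.setSortComparatorClass(DescendingIntComparator.class);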
5. Write a Pig Latin script to find the top K frequent words in a given set of text files. (Note: Add descriptions to the code wherever required.)

-- Load input from the folder named samip, and call the single
-- field in the record 'rline'.
RBL = LOAD '/usr/nsrp/samip/' USING TextLoader() AS (rline:chararray);

BL = FOREACH RBL GENERATE REPLACE(rline,'([^a-zA-Z\\s]+)','') AS line:chararray;

-- TOKENIZE splits the line into a field for each word.


-- flatten will take the collection of records returned by
-- TOKENIZE and produce a separate record for each one, calling the single
-- field in the record word.
BW = FOREACH BL GENERATE FLATTEN(TOKENIZE(line)) AS word;

-- Now group them together by each word.


WG = GROUP BW BY word;

-- DW/DWC compute the total number of distinct words; they are not used for the top-K result.
DW = GROUP WG ALL;
DWC = FOREACH DW GENERATE group, COUNT(WG);

-- Count them.
WC = FOREACH WG GENERATE group, COUNT(BW);
SWC = ORDER WC BY $1 DESC,$0 ASC;
TKFW = LIMIT SWC 5;

-- Print out the results.


DUMP TKFW;
6. Write a Pig Latin script to find maximum temperature in a given year given the
weather data with three fields Year, Temperature and Quality Parameter (QP) per
record as depicted in Table 1. Discard the records with temperature value 9999
or QP value in 2, 3, 6, 7, 8. The range of QP is 0 to 9.

-- max_temp_param.pig

records = LOAD '$input'
AS (Year:chararray, Temperature:int, QP:int);
filtered_records = FILTER records BY Temperature != 9999 AND
QP IN (0, 1, 4, 5, 9);
grouped_records = GROUP filtered_records BY Year;
max_temp = FOREACH grouped_records GENERATE group,
MAX(filtered_records.Temperature);
STORE max_temp INTO '$output';
DUMP max_temp;

% pig -param input=/input/sample.txt -param output=/tmp/out max_temp_param.pig


7. Implement an EvalFunc UDF in Pig Latin to trim leading and trailing whitespace
from char array values and explain the procedure to call the UDF in a Pig Latin
script.

package util.pig.it.bec;

import org.apache.pig.PrimitiveEvalFunc;

public class Trim extends PrimitiveEvalFunc<String, String> {

@Override
public String exec(String input) {
return input.trim();
}
}

Assume the name of the JAR file created from the above program is mypigutil.jar and
it is located in the directory whose path is /home/udf/pig. The JAR file has to be
registered with the Pig framework by using the following command.

REGISTER /home/udf/pig/mypigutil.jar;

If A is a relation with schema (fruit : chararray) and contains the following tuples

( pomegranate)
(banana )
(apple)
( lychee )

The Trim function can be invoked on the tuples in relation A as follows

B = FOREACH A GENERATE util.pig.it.bec.Trim(fruit);

The relation B contains the following tuples.

(pomegranate)
(banana)
(apple)
(lychee)
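A shorter alias can also be assigned to the UDF with DEFINE before it is used (the alias name trim_udf below is arbitrary):

DEFINE trim_udf util.pig.it.bec.Trim();
B = FOREACH A GENERATE trim_udf(fruit);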
8. Write a program in Pig Latin using macro to find the register numbers of
students who failed in a given subject.

A = LOAD 'Y15IIISEM.txt' AS (rn:int, regn:chararray, name:chararray,
s1e:int, s1i:int, s2e:int, s2i:int, s3e:int, s3i:int, s4e:int, s4i:int,
s5e:int, s5i:int, s6e:int, s6i:int, s7e:int, s7i:int, s8e:int, s8i:int,
s9e:int, s9i:int);

--F1S = ORDER A BY s1e DESC, s1i ASC;

DEFINE filter_by_subject(marks, f_key1, f_key2) RETURNS Y {


B = FILTER $marks BY ($f_key1 < 24) OR (($f_key1+$f_key2 < 40));
$Y = FOREACH B GENERATE regn;
};

-- Failures in first subject

S1F = filter_by_subject(A,s1e,s1i);

-- Failures in second subject

S2F = filter_by_subject(A,s2e,s2i);

DUMP S2F;
9. Write a program in Pig Latin using user defined function (UDF) to find the
register numbers of students who failed in a given subject.

// Class which implements user defined function

package util.pig.it.bec;

import java.io.IOException;

import org.apache.pig.FilterFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;

public class GetFR extends FilterFunc {

@Override
public Boolean exec(Tuple st) throws IOException {
if (st == null || st.size() == 0) {
return false;
}
try {
Object extm = st.get(0);
Object intm = st.get(1);

if ((extm == null) || (intm == null)) {
return false;
}
int e = (Integer) extm;
int i = (Integer) intm;
if ((e < 24) || (e + i < 40)) {
return true;
} else {
return false;
}
} catch (ExecException e) {
throw new IOException(e);
}
}
}
Assume the name of the JAR file created from the above program is mypigutil.jar and
it is located in the directory whose path is /home/udf/pig. The JAR file has to be
registered with the Pig framework by using the following command.

-- Pig Latin script to call the user defined function

REGISTER /home/udf/pig/mypigutil.jar;

--Using DEFINE assign a name to your function;

DEFINE myf util.pig.it.bec.GetFR();

A = LOAD 'Y15IIISEM.txt' AS (rn:int, regn:chararray, name:chararray,
s1e:int, s1i:int, s2e:int, s2i:int, s3e:int, s3i:int, s4e:int, s4i:int,
s5e:int, s5i:int, s6e:int, s6i:int, s7e:int, s7i:int, s8e:int, s8i:int,
s9e:int, s9i:int);

S1F = FILTER A BY myf(s1e,s1i);


R = FOREACH S1F GENERATE $1;
DUMP R;
10. Write a HiveQL script to find the top K frequent words in a given set of text files. (Note: Add descriptions to the code wherever required.)

CREATE TABLE book (content STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ' '
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH '/usr/nsrp/samip' OVERWRITE INTO TABLE book;

CREATE TABLE cbook AS
SELECT regexp_replace(trim(lower(content)), "[^A-Za-z0-9\\s]+", "") AS word
FROM book;

SELECT word, COUNT(1) AS wc
FROM cbook
GROUP BY word
ORDER BY wc DESC
LIMIT 5;
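Because the book table is declared with FIELDS TERMINATED BY ' ', each row of content holds only the first token of a line. A sketch of an alternative that counts every word by splitting whole lines with split and explode (the staging table name rawbook is illustrative):

CREATE TABLE rawbook (line STRING);
LOAD DATA LOCAL INPATH '/usr/nsrp/samip' OVERWRITE INTO TABLE rawbook;

SELECT word, COUNT(1) AS wc
FROM (SELECT explode(split(lower(line), '\\s+')) AS word FROM rawbook) w
GROUP BY word
ORDER BY wc DESC
LIMIT 5;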
11. How are joins performed between tables in Hive?

Consider two small demonstration tables, sales (which lists the names of people
and the IDs of the items they bought) and things (which lists the item IDs and
their names):

An inner join on the two tables can be performed as follows:
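A possible query, assuming the columns sales(name, id) and things(id, name):

SELECT sales.*, things.*
FROM sales JOIN things ON (sales.id = things.id);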

OUTER JOINS: Outer joins help to find mismatches between the tuples of the tables being joined. If the join type is LEFT OUTER JOIN, the query will return a row for every row in the left table (sales), even if there is no corresponding row in the table it is being joined to (things):
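A possible query, with the same assumed columns:

SELECT sales.*, things.*
FROM sales LEFT OUTER JOIN things ON (sales.id = things.id);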

If the join type is RIGHT OUTER JOIN, the query will return a row for every
row in the right table (things), even if there is no corresponding row in
the table it is being joined to (sales):
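A possible query, with the same assumed columns:

SELECT sales.*, things.*
FROM sales RIGHT OUTER JOIN things ON (sales.id = things.id);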

Hive also supports full outer joins, where a tuple from each table is placed in the output whether or not there is a matching tuple in the other table:
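A possible query, with the same assumed columns:

SELECT sales.*, things.*
FROM sales FULL OUTER JOIN things ON (sales.id = things.id);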

SEMI JOINS: This join operation is used to filter the tuples of one table based on a condition defined on another table. For example, to find all the items in the things table that appear in the sales table, a semi join can be performed as follows:
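A possible query, with the same assumed columns (Hive supports only LEFT SEMI JOIN, and the right-hand table may appear only in the ON clause):

SELECT * FROM things LEFT SEMI JOIN sales ON (sales.id = things.id);

// Custom Pig LoadFunc (CutLoadFunc) that loads fixed column ranges (cut-style fields) from each input line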
package util.pig.it.bec;

import java.io.IOException;
import java.util.List;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.Log;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class CutLoadFunc extends LoadFunc {

private static final Log LOG = LogFactory.getLog(CutLoadFunc.class);

private final List<Range> ranges;

private final TupleFactory tupleFactory = TupleFactory.getInstance();

private RecordReader reader;

public CutLoadFunc(String cutPattern) {
ranges = Range.parse(cutPattern);
}

@Override
public void setLocation(String location, Job job) throws IOException {
FileInputFormat.setInputPaths(job, location);
}

@Override
public InputFormat getInputFormat() {
return new TextInputFormat();
}

@Override
public void prepareToRead(RecordReader reader, PigSplit split) {
this.reader = reader;
}

@Override
public Tuple getNext() throws IOException {
try {
if (!reader.nextKeyValue()) {
return null;
}
Text value = (Text) reader.getCurrentValue();
String line = value.toString();
Tuple tuple = tupleFactory.newTuple(ranges.size());
for (int i = 0; i < ranges.size(); i++) {
Range range = ranges.get(i);
if (range.getEnd() > line.length()) {
LOG.warn(String.format(
"Range end (%s) is longer than line length (%s)",
range.getEnd(), line.length()));
continue;
}
tuple.set(i, new DataByteArray(range.getSubstring(line)));
}
return tuple;
} catch (InterruptedException e) {
throw new ExecException(e);
}
}
}
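// Pig FilterFunc UDF (IsGoodQuality) that keeps only records whose quality code is 0, 1, 4, 5 or 9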
package util.pig.it.bec;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pig.FilterFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;

public class IsGoodQuality extends FilterFunc {

@Override
public Boolean exec(Tuple tuple) throws IOException {
if (tuple == null || tuple.size() == 0) {
return false;
}
try {
Object object = tuple.get(0);
if (object == null) {
return false;
}
int i = (Integer) object;
return i == 0 || i == 1 || i == 4 || i == 5 || i == 9;
} catch (ExecException e) {
throw new IOException(e);
}
}

@Override
public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
List<FuncSpec> funcSpecs = new ArrayList<FuncSpec>();
funcSpecs.add(new FuncSpec(this.getClass().getName(),
new Schema(new Schema.FieldSchema(null, DataType.INTEGER))));
return funcSpecs;
}
}
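// Helper class Range, used by CutLoadFunc to represent and parse column ranges such as 2-5,8-10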
package util.pig.it.bec;
import java.util.ArrayList;

import java.util.Collections;

import java.util.List;

public class Range {

private final int start;

private final int end;

public Range(int start, int end) {
this.start = start;
this.end = end;
}

public int getStart() {
return start;
}

public int getEnd() {
return end;
}

public String getSubstring(String line) {
return line.substring(start - 1, end);
}

@Override
public int hashCode() {
return start * 37 + end;
}

@Override
public boolean equals(Object obj) {
if (!(obj instanceof Range)) {
return false;
}
Range other = (Range) obj;
return this.start == other.start && this.end == other.end;
}

public static List<Range> parse(String rangeSpec) throws IllegalArgumentException {
if (rangeSpec.length() == 0) {
return Collections.emptyList();
}
List<Range> ranges = new ArrayList<Range>();
String[] specs = rangeSpec.split(",");
for (String spec : specs) {
String[] split = spec.split("-");
try {
ranges.add(new Range(Integer.parseInt(split[0]),
Integer.parseInt(split[1])));
} catch (NumberFormatException e) {
throw new IllegalArgumentException(e.getMessage());
}
}
return ranges;
}
}
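// Spark word count application (Scala)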
package edu.bec.it.abd.se

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object wordcount {
def main(args: Array[String]): Unit = {

if(args.length < 2) {
println("Missing arguments")
sys.exit(1)
}
val conf = new SparkConf()
.setAppName("Word Count")
.setMaster("local[*]")

val sc = new SparkContext(conf)


val iptext = sc.textFile(args(0)).map(rl => rl.trim()).map(rl => rl.toLowerCase())
val words = iptext.flatMap(line => line.split(" "))
val iwc = words.map(word => (word, 1))
val cwc = iwc.reduceByKey((v1,v2) => v1+v2)
val swc = cwc.sortByKey(true)
swc.foreach(println)
swc.saveAsTextFile(args(1))

}
}
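Assuming the application is packaged into a JAR named wordcount.jar (an illustrative name), it can be run with spark-submit:

spark-submit --class edu.bec.it.abd.se.wordcount wordcount.jar <input path> <output path>

// Hive UDF (Strip) with overloaded evaluate methods for substring extraction and character stripping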
package edu.bec.it.hudf;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class Strip extends UDF {


private Text result = new Text();

public Text evaluate(Text str, int si, int ei) {


if (str == null) {
return null;
}
result.set(StringUtils.substring(str.toString(), si, ei));
return result;
}

public Text evaluate(Text str) {


if (str == null) {
return null;
}
result.set(StringUtils.strip(str.toString()));
return result;
}

public Text evaluate(Text str, String stripChars) {


if (str == null) {
return null;
}
result.set(StringUtils.strip(str.toString(), stripChars));
return result;
}
}
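Assuming the UDF classes are packaged into a JAR named hiveudf.jar (the name and path are illustrative), Strip can be used from Hive as follows:

ADD JAR /path/to/hiveudf.jar;
CREATE TEMPORARY FUNCTION strip AS 'edu.bec.it.hudf.Strip';
SELECT strip(col) FROM some_table;   -- col is any STRING column

// Hive UDAF (Mean) with evaluators that compute the mean of float and int values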
package edu.bec.it.hudf;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;

public class Mean extends UDAF {


public static class MeanFloatUDAFEvaluator implements UDAFEvaluator {
public static class PartialResult {
float sum;
long count;
}

private PartialResult partial;

public void init() {


partial = null;
}

public boolean iterate(FloatWritable value) {


if (value == null) {
return true;
}
if (partial == null) {
partial = new PartialResult();
}
partial.sum += value.get();
partial.count++;
return true;
}

public PartialResult terminatePartial() {


return partial;
}

public boolean merge(PartialResult other) {


if (other == null) {
return true;
}
if (partial == null) {
partial = new PartialResult();
}
partial.sum += other.sum;
partial.count += other.count;
return true;
}

public FloatWritable terminate() {


if (partial == null) {
return null;
}
return new FloatWritable(partial.sum / partial.count);
}
}

public static class MeanIntUDAFEvaluator implements UDAFEvaluator {


public static class PartialResult {
int sum;
long count;
}

private PartialResult partial;


public void init() {
partial = null;
}

public boolean iterate(IntWritable value) {


if (value == null) {
return true;
}
if (partial == null) {
partial = new PartialResult();
}
partial.sum += value.get();
partial.count++;
return true;
}

public PartialResult terminatePartial() {


return partial;
}

public boolean merge(PartialResult other) {


if (other == null) {
return true;
}
if (partial == null) {
partial = new PartialResult();
}
partial.sum += other.sum;
partial.count += other.count;
return true;
}

public FloatWritable terminate() {


if (partial == null) {
return null;
}
return new FloatWritable((float) partial.sum / partial.count);
}
}

}
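A similar registration works for the UDAF (again, the JAR, table, and column names are illustrative):

ADD JAR /path/to/hiveudf.jar;
CREATE TEMPORARY FUNCTION my_mean AS 'edu.bec.it.hudf.Mean';
SELECT my_mean(temperature) FROM weather_records;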
Describe the complex data types array, map, and struct in Hive with an example script.


Write about COGROUP in Hive

How does table partitioning improve Hive query performance?

Describe the Sqoop import command in detail.
