Documenti di Didattica
Documenti di Professioni
Documenti di Cultura
wget https://www.apache.org/dist/flume/stable/apache-flume-1.6.0-bin.tar.gz
https://drive.google.com/file/d/0B_t6uqPmWadsdWJNQ0NjaXBUYUk/view?usp=sharing
cp flume-sources-1.0-SNAPSHOT.jar $FLUME_HOME/lib
2.5 Modify flume-env.sh file Paste given lines at the end of the file.
export JAVA_HOME=/usr/local/java/jdk1.8.0_144
FLUME_CLASSPATH="/home/hadoop/work/apache-flume-1.6.0-bin/lib/flume-sources-1.0-SNAPSHOT.jar"
2.6 Download consumerKey, consumerSecret, accessToken, and accessTokenSecret
from https://apps.twitter.com/ which can be accessed from your twitter
developer account by creating a simple app. See here how to create a twitter
app: https://www.youtube.com/watch?v=xqSp7060Gj0
2.7 Create flume-twitter.conf file in conf folder and paste given lines
TwitterAgent.sources = Twitter
TwitterAgent.channels = MemChannel
TwitterAgent.sinks = HDFS
TwitterAgent.sources.Twitter.type = com.cloudera.flume.source.TwitterSource
TwitterAgent.sources.Twitter.channels = MemChannel
TwitterAgent.sources.Twitter.consumerKey = xxxxxxxxxxxxxxxxxxxxxxxx
TwitterAgent.sources.Twitter.consumerSecret = xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
TwitterAgent.sources.Twitter.accessToken = xxxxxxxx-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
TwitterAgent.sources.Twitter.accessTokenSecret = xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
TwitterAgent.sources.Twitter.keywords = spark, flink
This is where you give the keywords to be fetched from Twitter. Replace spark,
flink with your desired keywords.
TwitterAgent.sinks.HDFS.channel = MemChannel
TwitterAgent.sinks.HDFS.type = hdfs
TwitterAgent.sinks.HDFS.hdfs.path = hdfs://localhost:9000/spark/
TwitterAgent.sinks.HDFS.hdfs.fileType = DataStream
TwitterAgent.sinks.HDFS.hdfs.writeFormat = Text
TwitterAgent.sinks.HDFS.hdfs.batchSize = 1000
TwitterAgent.sinks.HDFS.hdfs.rollSize = 0
TwitterAgent.sinks.HDFS.hdfs.rollCount = 10000
TwitterAgent.channels.MemChannel.type = memory
TwitterAgent.channels.MemChannel.capacity = 10000
TwitterAgent.channels.MemChannel.transactionCapacity = 10000
export FLUME_HOME=/usr/local/apache-flume-1.6.0-bin
export FLUME_CONF_DIR=$FLUME_HOME/conf
export FLUME_CLASS_PATH=$FLUME_CONF_DIR
export PATH=$FLUME_HOME/bin:$PATH
cd $FLUME_HOME/lib
2.10 and rename these 3 files. (All you need to do is change the extension of
these files from .jar to .org)
cd $FLUME_HOME
2.12 Enter this command to get Twitter data continuously:
bin/flume-ng agent --conf ./conf/ -f conf/flume-twitter.conf -n TwitterAgent
/usr/local/hadoop/sbin/start-dfs.sh
/usr/local/hadoop/sbin/start-yarn.sh
/usr/local/hadoop/sbin/stop-dfs.sh
/usr/local/hadoop/sbin/stop-yarn.sh
TwitterAgent.sinks.HDFS.channel = MemCh
TwitterAgent.sinks.HDFS.type = hdfs
TwitterAgent.sinks.HDFS.hdfs.path = hdfs://localhost:9000/flume/Twitter/day_key=%Y%m%d/
TwitterAgent.sinks.HDFS.hdfs.fileType = DataStream
TwitterAgent.sinks.HDFS.hdfs.writeFormat = Text
TwitterAgent.sinks.HDFS.hdfs.batchSize=1000
TwitterAgent.sinks.HDFS.hdfs.rollSize=0
TwitterAgent.sinks.HDFS.hdfs.rollCount=10000
TwitterAgent.sinks.HDFS.hdfs.rollInterval=600
capacity (default 100): the maximum number of events stored in the channel.
TwitterAgent.channels.MemCh.type = memory
TwitterAgent.channels.MemCh.capacity = 10000
TwitterAgent.channels.MemCh.transactionCapacity = 1000