1 環境:基於上一篇搭建的高可用分布式集群
2 官方提供的 MapReduce 示例程序
#評估圓周率
cd /data/hadoop/share/hadoop/mapreduce/
hadoop jar hadoop-mapreduce-examples-3.4.0.jar pi 2 6
3 實例項目分析1
#待分析的文件,例如用於單詞統計的 word.txt
#上傳文件到hdfs
hdfs dfs -put word.txt /test/01/
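#可以先在本地測試一下,再運行計算
#注意:下面的管道沒有 sort,相同單詞不相鄰時不會被合併(所以輸出中 foo 等單詞出現多行);
#在 m.py 與 r.py 之間加上 | sort 才能模擬 Hadoop 在 map 與 reduce 之間的排序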
[root@master11 01]# cat word.txt | python m.py |python r.py
foo 1
quux 1
labs 1
foo 1
bar 1
quux 1
good 1
six 1
good 1
foo 2
quux 1
labs 1
foo 1
bar 1
quux 1
good 1
six 1
good 1
foo 1
# hadoop jar /data/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar -D stream.non.zero.exit.is.failure=false -mapper /data/test/01/m.py -reducer /data/test/01/r.py -input /test/01/ -output /test/output1/
#拷貝結果文件到服務器本地
hadoop fs -copyToLocal /test/output1/part-00000 /root/part-00000
#查看
[root@master11 ~]# cat part-00000
bar 2
foo 6
good 4
labs 2
quux 4
six 2
[root@master11 01]# cat m.py
#!/usr/bin/env python
"""Hadoop Streaming mapper for the word-count example.

Reads text lines from standard input and writes one ``word<TAB>1`` record
per whitespace-separated token to standard output.  Runs unchanged on
Python 2 and Python 3.
"""
import sys


def main(stdin=None, stdout=None):
    """Emit ``word<TAB>1`` for every token read from ``stdin``.

    Both streams default to the process's standard streams; they are
    parameters only so the mapper can be driven without a real pipe.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for line in stdin:
        # split() with no argument already discards leading/trailing
        # whitespace, so no separate strip() is needed.
        for word in line.split():
            # write() instead of the print statement keeps the script valid
            # on both Python 2 and Python 3.
            stdout.write('%s\t%s\n' % (word, 1))


if __name__ == '__main__':
    main()
[root@master11 01]# cat r.py
#!/usr/bin/env python
"""Hadoop Streaming reducer for the word-count example.

Reads ``word<TAB>count`` records from standard input and writes one
``word<TAB>total`` record per run of consecutive identical words.  Counts
are only merged for adjacent records, so the input must be sorted by word
(pipe through ``sort`` when testing locally).  Runs unchanged on Python 2
and Python 3.
"""
from operator import itemgetter  # NOTE(review): unused below; kept from the original script
import sys


def main(stdin=None, stdout=None):
    """Sum the counts of consecutive records that share the same word.

    Records without a tab separator or with a non-integer count are
    skipped.  Both streams default to the process's standard streams.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout

    current_word = None
    current_count = 0
    for line in stdin:
        word, sep, count = line.strip().partition('\t')
        if not sep:
            # Blank or malformed line: no key/value pair to aggregate.
            continue
        try:
            count = int(count)
        except ValueError:
            # Count was not a number; ignore the record.
            continue

        if word == current_word:
            current_count += count
        else:
            if current_word is not None:
                stdout.write('%s\t%s\n' % (current_word, current_count))
            current_word = word
            current_count = count

    # Flush the last group.  Testing against None (rather than comparing with
    # the last word read) avoids printing "None\t0" on empty input and avoids
    # losing the group when the final record was skipped as malformed.
    if current_word is not None:
        stdout.write('%s\t%s\n' % (current_word, current_count))


if __name__ == '__main__':
    main()
4 項目實戰分析2:nginx 日誌 IP 計數
hdfs dfs -put t.log /test/01/
#執行計算
hadoop jar /data/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar -D stream.non.zero.exit.is.failure=false -mapper /data/test/01/map.py -reducer /data/test/01/red.py -input /test/01/t.log -output /test/output2/
#下載
hadoop fs -copyToLocal /test/output2/part-00000 /root/part-00000
#查看
?
[root@master11 01]# cat map.py
#!/usr/bin/python
"""Hadoop Streaming mapper that extracts the leading IP of each log line.

Reads log lines from standard input and, for every line that begins with a
run of digits and dots, writes ``ip<TAB>1`` to standard output.  Runs
unchanged on Python 2 and Python 3.
"""
import sys
import re

# Compiled once instead of once per input line.  "+" (not "*") is required:
# "*" also matches the empty string, which made every non-IP line emit an
# empty key.
_LEADING_IP = re.compile(r'([\d.]+)')


def main(stdin=None, stdout=None):
    """Emit ``ip<TAB>1`` for each line of ``stdin`` that starts with an address.

    Lines that do not start with a digit or dot are skipped.  Both streams
    default to the process's standard streams.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for line in stdin:
        match = _LEADING_IP.match(line)
        if match:
            # No leading space before the key: Hadoop Streaming treats
            # everything up to the first tab as the key, so a stray space
            # would become part of it.
            stdout.write('%s\t%s\n' % (match.group(1), 1))


if __name__ == '__main__':
    main()
[root@master11 01]# cat red.py
#!/usr/bin/python
#-*-coding:UTF-8 -*-
"""Hadoop Streaming reducer that totals the count emitted for each key.

Reads ``key<TAB>count`` records from standard input, sums the counts per
key in memory, and writes ``key<TAB>total`` records in sorted key order.
A record that has no tab or whose value is not an integer is counted once
under its full text, so plain line counting still works.  Runs unchanged on
Python 2 and Python 3.
"""
import sys
import os
import string


def main(stdin=None, stdout=None):
    """Aggregate counts per key from ``stdin`` and write totals to ``stdout``.

    Both streams default to the process's standard streams.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout

    totals = {}
    for line in stdin:
        # rstrip('\n') rather than slicing off the last character, which
        # truncated the key when the final line had no trailing newline.
        record = line.rstrip('\n')
        key, _, value = record.partition('\t')
        try:
            count = int(value)
        except ValueError:
            # No usable numeric value: count the whole record once.
            key, count = record, 1
        # dict.get replaces has_key(), which no longer exists in Python 3.
        totals[key] = totals.get(key, 0) + count

    # Sorted so the output is deterministic across runs and interpreters.
    for key in sorted(totals):
        stdout.write('%s\t%s\n' % (key, totals[key]))


if __name__ == '__main__':
    main()
5 歡迎同學們一起交流