MapReduce Examples
1. Computing average scores¶
Write a Python program that uses Hadoop Streaming to compute each student's average score from the input files. Each line of an input file contains one student's name and one score; when there are several subjects, each subject sits in its own file. Each output line must contain two tab-separated fields: the student's name and the student's average score. For example, 张三's average is (80 + 97 + 88) / 3 ≈ 88.3, which integer division truncates to 88. A sample of the input and output files follows.
[root@master demo4]# cat chinese.txt
张三 80
李四 78
王五 92
[root@master demo4]# cat english.txt
张三 97
李四 94
王五 88
[root@master demo4]# cat math.txt
张三 88
李四 86
王五 90
Code
[root@master demo4]# cat map.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys

for line in sys.stdin:
    line = line.strip()
    words = line.split()
    if not words:
        continue
    # emit: name <TAB> score
    print '%s\t%s' % (words[0], words[1])
[root@master demo4]# cat red.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys

count = 0
total = 0
current_name = None

for line in sys.stdin:
    line = line.strip()
    name, score = line.split('\t', 1)
    try:
        score = int(score)
    except ValueError:
        continue
    if current_name == name:
        # same student: keep accumulating
        count += 1
        total += score
    else:
        if current_name is not None:
            # integer division (Python 2) truncates the average
            print '%s\t%s' % (current_name, total / count)
        current_name = name
        total = score
        count = 1

# flush the last group
if current_name is not None:
    print '%s\t%s' % (current_name, total / count)
[root@master demo4]# cat run.sh
HADOOP_CMD="/usr/local/hadoop-2.6.5/bin/hadoop"
STREAM_JAR_PATH="/usr/local/hadoop-2.6.5/share/hadoop/tools/lib/hadoop-streaming-2.6.5.jar"
INPUT_FILE_PATH_1="/score/*"
OUTPUT_PATH="/output"

$HADOOP_CMD fs -rmr -skipTrash $OUTPUT_PATH

# Step 1.
$HADOOP_CMD jar $STREAM_JAR_PATH \
    -input $INPUT_FILE_PATH_1 \
    -output $OUTPUT_PATH \
    -mapper "python map.py" \
    -reducer "python red.py" \
    -file ./map.py \
    -file ./red.py
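The scripts above are Python 2 (print statements; / truncates on integers). For reference only, a minimal Python 3 sketch of an equivalent reducer could look like this, assuming the same tab-separated, key-sorted mapper output; the groupby-based structure is an alternative to the running current_name comparison, not part of the original job:

#!/usr/bin/env python3
# Sketch of a Python 3 red.py using itertools.groupby.
# Assumes stdin is already sorted by key, as Hadoop Streaming guarantees.
import sys
from itertools import groupby

def parse(stdin):
    for line in stdin:
        name, score = line.rstrip('\n').split('\t', 1)
        yield name, int(score)

for name, group in groupby(parse(sys.stdin), key=lambda kv: kv[0]):
    scores = [score for _, score in group]
    # // mirrors Python 2's truncating integer division in the original red.py
    print('%s\t%s' % (name, sum(scores) // len(scores)))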
Manual simulation
[root@master demo4]# cat *.txt|python map.py |sort -k1|python red.py
张三	88
李四	86
王五	90

Here `sort -k1` stands in for Hadoop's shuffle/sort phase, so red.py sees records grouped by key just as it would in the real job.
Run
First upload the input files to HDFS (the `File exists` errors just mean the files were already uploaded on a previous run):

[root@master demo4]# hadoop fs -put *.txt /score
put: `/score/chinese.txt': File exists
put: `/score/english.txt': File exists
put: `/score/math.txt': File exists
[root@master demo4]# sh run.sh
rmr: DEPRECATED: Please use 'rm -r' instead.
Deleted /output
19/07/04 17:51:48 WARN streaming.StreamJob: -file option is deprecated, please use generic option -files instead.
packageJobJar: [./map.py, ./red.py, /tmp/hadoop-unjar3934570659916298598/] [] /tmp/streamjob6371931224936925713.jar tmpDir=null
19/07/04 17:51:49 INFO client.RMProxy: Connecting to ResourceManager at master/192.168.186.10:8032
19/07/04 17:51:49 INFO client.RMProxy: Connecting to ResourceManager at master/192.168.186.10:8032
19/07/04 17:51:50 INFO mapred.FileInputFormat: Total input paths to process : 3
19/07/04 17:51:50 INFO mapreduce.JobSubmitter: number of splits:3
19/07/04 17:51:50 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1561818301923_0050
19/07/04 17:51:50 INFO impl.YarnClientImpl: Submitted application application_1561818301923_0050
19/07/04 17:51:50 INFO mapreduce.Job: The url to track the job: http://master:8088/proxy/application_1561818301923_0050/
19/07/04 17:51:50 INFO mapreduce.Job: Running job: job_1561818301923_0050
19/07/04 17:51:56 INFO mapreduce.Job: Job job_1561818301923_0050 running in uber mode : false
19/07/04 17:51:56 INFO mapreduce.Job:  map 0% reduce 0%
19/07/04 17:52:04 INFO mapreduce.Job:  map 100% reduce 0%
19/07/04 17:52:09 INFO mapreduce.Job:  map 100% reduce 100%
19/07/04 17:52:09 INFO mapreduce.Job: Job job_1561818301923_0050 completed successfully
19/07/04 17:52:09 INFO mapreduce.Job: Counters: 50
    File System Counters
        FILE: Number of bytes read=114
        FILE: Number of bytes written=442361
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=357
        HDFS: Number of bytes written=30
        HDFS: Number of read operations=12
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=2
    Job Counters
        Killed map tasks=1
        Launched map tasks=3
        Launched reduce tasks=1
        Data-local map tasks=3
        Total time spent by all maps in occupied slots (ms)=17263
        Total time spent by all reduces in occupied slots (ms)=2977
        Total time spent by all map tasks (ms)=17263
        Total time spent by all reduce tasks (ms)=2977
        Total vcore-milliseconds taken by all map tasks=17263
        Total vcore-milliseconds taken by all reduce tasks=2977
        Total megabyte-milliseconds taken by all map tasks=17677312
        Total megabyte-milliseconds taken by all reduce tasks=3048448
    Map-Reduce Framework
        Map input records=15
        Map output records=9
        Map output bytes=90
        Map output materialized bytes=126
        Input split bytes=261
        Combine input records=0
        Combine output records=0
        Reduce input groups=3
        Reduce shuffle bytes=126
        Reduce input records=9
        Reduce output records=3
        Spilled Records=18
        Shuffled Maps =3
        Failed Shuffles=0
        Merged Map outputs=3
        GC time elapsed (ms)=619
        CPU time spent (ms)=4380
        Physical memory (bytes) snapshot=902430720
        Virtual memory (bytes) snapshot=8453459968
        Total committed heap usage (bytes)=670040064
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters
        Bytes Read=96
    File Output Format Counters
        Bytes Written=30
19/07/04 17:52:09 INFO streaming.StreamJob: Output directory: /output
[root@master demo4]# hadoop fs -text /output/part-00000
张三	88
李四	86
王五	90
2. Common friends¶
Friend data
[root@master cmz]# cat fdatas.txt
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J
Data format: person:friend1,friend2,... The task: for every pair of people, find their common friends.
map.py
#-*-encoding:utf-8-*-
import sys

# person -> list of that person's friends, accumulated line by line
result = {}

for line in sys.stdin:
    line = line.strip()
    if len(line) == 0:
        continue
    myself, vals = line.split(':')
    result[myself] = vals.split(',')
    # compare the newly read person against everyone seen so far
    for myself_i in result:
        comm = set(result[myself_i]) & set(result[myself])
        if myself == myself_i or len(comm) == 0:
            continue
        # emit: pair <TAB> common friends, e.g. "BA\tCE"
        print '%s%s\t%s' % (myself, myself_i, ''.join(comm))
Mapper approach
1. First, split an input line like `A:B,C,D,F,E,O` into a dict entry `{'A': ['B', 'C', 'D', 'F', 'E', 'O']}`. Every new line adds one entry, so the dict ends up keyed by person, with that person's friend list as the value.
2. After reading the first line of fdatas.txt: `result['A'] = ['B', 'C', 'D', 'F', 'E', 'O']`, so `result = {'A': ['B', 'C', 'D', 'F', 'E', 'O']}`.
3. After reading the second line: `result['B'] = ['A', 'C', 'E', 'K']`, so `result = {'A': ['B', 'C', 'D', 'F', 'E', 'O'], 'B': ['A', 'C', 'E', 'K']}`.
4. Once the second line has been read, the comparisons start: intersect the current person's friend list (here `['A', 'C', 'E', 'K']`) with the friend list of every person already stored in `result`, as the sketch below illustrates.
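The intersection step in isolation (a standalone sketch; the dict literal and the sorted() call are only there to make the illustration reproducible):

# Standalone sketch of the pairwise-intersection step used in map.py.
friends = {
    'A': ['B', 'C', 'D', 'F', 'E', 'O'],   # line 1 of fdatas.txt
    'B': ['A', 'C', 'E', 'K'],             # line 2
}
common = set(friends['B']) & set(friends['A'])
# prints: BA	CE  (B and A have C and E as common friends)
print('BA\t%s' % ''.join(sorted(common)))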
red.py
#-*-encoding:utf-8-*-
import sys

# pair -> list of common-friend strings for that pair
result = {}

for line in sys.stdin:
    line = line.strip()
    k, v = line.split('\t')
    if k in result:
        result[k].append(v)
    else:
        result[k] = [v]

for key, val in result.items():
    # "共同好友是:" = "common friends are:"
    print key, "共同好友是:", val
run.sh
HADOOP_CMD="/usr/local/hadoop-2.6.5/bin/hadoop"
STREAM_JAR_PATH="/usr/local/hadoop-2.6.5/share/hadoop/tools/lib/hadoop-streaming-2.6.5.jar"
INPUT_FILE_PATH="/fdatas.txt"
OUTPUT_PATH="/cc"

$HADOOP_CMD fs -rmr -skipTrash $OUTPUT_PATH

$HADOOP_CMD jar $STREAM_JAR_PATH \
    -input $INPUT_FILE_PATH \
    -output $OUTPUT_PATH \
    -mapper 'python map.py' \
    -reducer 'python red.py' \
    -jobconf mapred.min.split.size=128000000 \
    -file *.py
Run
[root@master cmz]# ls
fdatas.txt  map.py  red.py  run.sh
[root@master cmz]# sh run.sh
rmr: DEPRECATED: Please use 'rm -r' instead.
Deleted /cc
19/07/11 10:49:00 WARN streaming.StreamJob: -file option is deprecated, please use generic option -files instead.
19/07/11 10:49:00 WARN streaming.StreamJob: -jobconf option is deprecated, please use -D instead.
19/07/11 10:49:00 INFO Configuration.deprecation: mapred.min.split.size is deprecated. Instead, use mapreduce.input.fileinputformat.split.minsize
packageJobJar: [map.py, red.py, /tmp/hadoop-unjar979869920241111383/] [] /tmp/streamjob3948242446941020213.jar tmpDir=null
19/07/11 10:49:00 INFO client.RMProxy: Connecting to ResourceManager at master/192.168.186.10:8032
19/07/11 10:49:01 INFO client.RMProxy: Connecting to ResourceManager at master/192.168.186.10:8032
19/07/11 10:49:02 INFO mapred.FileInputFormat: Total input paths to process : 1
19/07/11 10:49:02 INFO mapreduce.JobSubmitter: number of splits:1
19/07/11 10:49:03 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1562812999138_0001
19/07/11 10:49:03 INFO impl.YarnClientImpl: Submitted application application_1562812999138_0001
19/07/11 10:49:03 INFO mapreduce.Job: The url to track the job: http://master:8088/proxy/application_1562812999138_0001/
19/07/11 10:49:03 INFO mapreduce.Job: Running job: job_1562812999138_0001
19/07/11 10:49:09 INFO mapreduce.Job: Job job_1562812999138_0001 running in uber mode : false
19/07/11 10:49:09 INFO mapreduce.Job:  map 0% reduce 0%
19/07/11 10:49:14 INFO mapreduce.Job:  map 100% reduce 0%
19/07/11 10:49:20 INFO mapreduce.Job:  map 100% reduce 100%
19/07/11 10:49:20 INFO mapreduce.Job: Job job_1562812999138_0001 completed successfully
19/07/11 10:49:21 INFO mapreduce.Job: Counters: 49
    File System Counters
        FILE: Number of bytes read=671
        FILE: Number of bytes written=222363
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=223
        HDFS: Number of bytes written=2145
        HDFS: Number of read operations=6
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=2
    Job Counters
        Launched map tasks=1
        Launched reduce tasks=1
        Data-local map tasks=1
        Total time spent by all maps in occupied slots (ms)=2321
        Total time spent by all reduces in occupied slots (ms)=3110
        Total time spent by all map tasks (ms)=2321
        Total time spent by all reduce tasks (ms)=3110
        Total vcore-milliseconds taken by all map tasks=2321
        Total vcore-milliseconds taken by all reduce tasks=3110
        Total megabyte-milliseconds taken by all map tasks=2376704
        Total megabyte-milliseconds taken by all reduce tasks=3184640
    Map-Reduce Framework
        Map input records=14
        Map output records=74
        Map output bytes=517
        Map output materialized bytes=671
        Input split bytes=81
        Combine input records=0
        Combine output records=0
        Reduce input groups=74
        Reduce shuffle bytes=671
        Reduce input records=74
        Reduce output records=74
        Spilled Records=148
        Shuffled Maps =1
        Failed Shuffles=0
        Merged Map outputs=1
        GC time elapsed (ms)=126
        CPU time spent (ms)=1500
        Physical memory (bytes) snapshot=441380864
        Virtual memory (bytes) snapshot=4230311936
        Total committed heap usage (bytes)=281018368
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters
        Bytes Read=142
    File Output Format Counters
        Bytes Written=2145
19/07/11 10:49:21 INFO streaming.StreamJob: Output directory: /cc
[root@master friends]# hadoop fs -ls /cc
Found 2 items
-rw-r--r-- 3 root supergroup 0 2019-07-11 10:49 /cc/_SUCCESS
-rw-r--r-- 3 root supergroup 2145 2019-07-11 10:49 /cc/part-00000
[root@master friends]# hadoop fs -text /cc/*
LE 共同好友是: ['D']
CB 共同好友是: ['A']
KA 共同好友是: ['CD']
EB 共同好友是: ['C']
KC 共同好友是: ['AD']
DB 共同好友是: ['AE']
LG 共同好友是: ['EDF']
LA 共同好友是: ['EDF']
ED 共同好友是: ['L']
JE 共同好友是: ['B']
KE 共同好友是: ['CD']
GD 共同好友是: ['AEF']
KG 共同好友是: ['ACD']
FB 共同好友是: ['ACE']
OB 共同好友是: ['A']
LC 共同好友是: ['DF']
GF 共同好友是: ['ACED']
KI 共同好友是: ['A']
IH 共同好友是: ['AO']
OH 共同好友是: ['A']
GC 共同好友是: ['ADF']
ML 共同好友是: ['EF']
FE 共同好友是: ['CBMD']
LH 共同好友是: ['ED']
HA 共同好友是: ['CEDO']
IC 共同好友是: ['A']
IA 共同好友是: ['O']
MH 共同好友是: ['E']
OG 共同好友是: ['A']
HC 共同好友是: ['AD']
MG 共同好友是: ['EF']
HE 共同好友是: ['CD']
IG 共同好友是: ['A']
OD 共同好友是: ['A']
MD 共同好友是: ['EF']
HG 共同好友是: ['ACED']
MF 共同好友是: ['E']
JH 共同好友是: ['O']
DA 共同好友是: ['EF']
LF 共同好友是: ['ED']
LD 共同好友是: ['EF']
EC 共同好友是: ['D']
OK 共同好友是: ['A']
EA 共同好友是: ['CBD']
KB 共同好友是: ['AC']
KD 共同好友是: ['A']
OF 共同好友是: ['A']
DC 共同好友是: ['AF']
FC 共同好友是: ['AD']
LB 共同好友是: ['E']
GE 共同好友是: ['CD']
KF 共同好友是: ['ACD']
KH 共同好友是: ['ACD']
FA 共同好友是: ['CBEDO']
MC 共同好友是: ['F']
MB 共同好友是: ['E']
GA 共同好友是: ['CEDF']
FD 共同好友是: ['AE']
JI 共同好友是: ['O']
GB 共同好友是: ['ACE']
MA 共同好友是: ['EF']
IB 共同好友是: ['A']
OC 共同好友是: ['AI']
JF 共同好友是: ['BO']
HB 共同好友是: ['ACE']
LK 共同好友是: ['D']
OI 共同好友是: ['A']
IF 共同好友是: ['AO']
JA 共同好友是: ['BO']
BA 共同好友是: ['CE']
HD 共同好友是: ['AE']
HF 共同好友是: ['ACEDO']
CA 共同好友是: ['DF']
ID 共同好友是: ['A']
[root@master friends]# hadoop fs -text /cc/*|wc -l
74
Notes
1. The job passes the parameter `-jobconf mapred.min.split.size=128000000` so that the source file is not split and the job runs with a single map task. The mapper keeps every person it has seen so far in memory, ==so if the file were split, friend relationships across splits would be lost==.
2. The job log confirms the task count: `19/07/11 10:49:02 INFO mapreduce.JobSubmitter: number of splits:1`
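As the deprecation warnings in the job log suggest, the same submission can be written with the generic `-D` and `-files` options and the new property name. A sketch of the equivalent invocation, reusing the variables from run.sh (note that generic options must come before the streaming options):

$HADOOP_CMD jar $STREAM_JAR_PATH \
    -D mapreduce.input.fileinputformat.split.minsize=128000000 \
    -input $INPUT_FILE_PATH \
    -output $OUTPUT_PATH \
    -mapper 'python map.py' \
    -reducer 'python red.py' \
    -files map.py,red.py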
3. User traffic analysis with Hadoop Streaming¶
This example uses Python with Hadoop Streaming to analyze user data, reporting each user's phone number, upstream traffic, downstream traffic, and total traffic.
The data source to analyze
The text file holds a large number of user browsing records, including the phone number, access time, device serial number, access IP, visited site, upstream traffic, downstream traffic, total traffic, and so on. (The third-to-last column is the upstream traffic, the second-to-last column is the downstream traffic, and the second column is the phone number.)
[root@master abc]# cat data.txt
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com ???? 15 12 1527 2106 200
1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn ???? 20 20 3156 2936 200
1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com ???? 24 9 6960 690 200
1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com ???? 28 27 3659 3538 200
1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com ???? 3 3 1938 180 200
1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com ???? 15 12 1938 2910 200
1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
1363157982040 13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com ???? 57 102 7335 110349 200
1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com ???? 21 18 9531 2412 200
1363157990043 13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com ???? 69 63 11058 48243 200
1363157988072 13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200
1363157985066 13726238888 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157993055 13560436666 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
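Because the visited-site columns are sometimes missing, the record width varies, so the flow fields are best addressed from the end of the record. A small illustrative sketch using the first record above (shown space-separated here for readability; the real file is tab-delimited, which is why flow.py splits on '\t'):

# Illustrative: extract the flow fields from one record of data.txt.
line = ('1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC '
        '120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200')
fields = line.split()
phone = fields[1]                    # phone number: always the 2nd column
up, down = fields[-3], fields[-2]    # 3rd- and 2nd-to-last columns
print('%s\t%s\t%s\t%s' % (phone, up, down, int(up) + int(down)))
# -> 13726230503	2481	24681	27162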
[root@master abc]# cat run.sh
HADOOP_CMD="/usr/local/hadoop-2.6.5/bin/hadoop"
STREAM_JAR_PATH="/usr/local/hadoop-2.6.5/share/hadoop/tools/lib/hadoop-streaming-2.6.5.jar"
INPUT_FILE_PATH="/data.txt"
OUTPUT_PATH="/cc"

$HADOOP_CMD fs -rmr -skipTrash $OUTPUT_PATH

$HADOOP_CMD jar $STREAM_JAR_PATH \
    -input $INPUT_FILE_PATH \
    -output $OUTPUT_PATH \
    -mapper 'python flow.py mapper' \
    -reducer 'python flow.py reducer' \
    -file *.py
[root@master abc]# cat flow.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""
@author:caimengzhi
@file:flow.py
@time: PM 2019-07-17
"""
import sys
import gc
import logging

# log errors to a file instead of polluting the streaming output
logger = logging.getLogger()
logfile = 'flow.log'
hdlr = logging.FileHandler(logfile)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.NOTSET)
gc.disable()


def mapper():
    for line in sys.stdin:
        dataline = line.rstrip().split('\t')
        # records have 9 to 11 columns depending on whether a visited site
        # is present, so the flow fields are addressed from the end
        if len(dataline) >= 9:
            try:
                phone = dataline[1]
                upFlow = dataline[-3]
                downFlow = dataline[-2]
                sumFlow = int(upFlow) + int(downFlow)
            except (ValueError, IndexError):
                continue
            # emit: phone, upstream flow, downstream flow, total flow
            print '%s\t%s\t%s\t%s' % (phone, upFlow, downFlow, sumFlow)


def reducer():
    current_phone = None
    current_up = 0
    current_down = 0
    current_sum = 0
    for line in sys.stdin:
        # input arrives sorted by phone number
        phone, up, down, total = line.rstrip().split('\t')
        if current_phone == phone:
            # same phone: accumulate upstream, downstream and total flow
            current_up = int(current_up) + int(up)
            current_down = int(current_down) + int(down)
            current_sum = int(current_sum) + int(total)
        else:
            if current_phone is not None:
                print '%s,%s,%s,%s' % (current_phone, current_up, current_down, current_sum)
            current_phone = phone
            current_up = up
            current_down = down
            current_sum = total
    # flush the last group
    if current_phone is not None:
        print '%s,%s,%s,%s' % (current_phone, current_up, current_down, current_sum)


d = {'mapper': mapper, 'reducer': reducer}
if sys.argv[1] in d:
    d[sys.argv[1]]()
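As with the first example, flow.py can be smoke-tested locally before submission, with sort standing in for the shuffle phase (assuming data.txt sits in the working directory):

cat data.txt | python flow.py mapper | sort -k1 | python flow.py reducer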
Execution
[root@master abc]# hadoop fs -put data.txt /
[root@master abc]# sh run.sh
rmr: DEPRECATED: Please use 'rm -r' instead.
Deleted /cc
19/07/17 13:59:50 WARN streaming.StreamJob: -file option is deprecated, please use generic option -files instead.
packageJobJar: [flow.py, /tmp/hadoop-unjar4908725997945227088/] [] /tmp/streamjob5354398154324950355.jar tmpDir=null
19/07/17 13:59:51 INFO client.RMProxy: Connecting to ResourceManager at master/192.168.186.10:8032
19/07/17 13:59:51 INFO client.RMProxy: Connecting to ResourceManager at master/192.168.186.10:8032
19/07/17 13:59:51 INFO mapred.FileInputFormat: Total input paths to process : 1
19/07/17 13:59:51 INFO mapreduce.JobSubmitter: number of splits:2
19/07/17 13:59:52 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1562812999138_0002
19/07/17 13:59:52 INFO impl.YarnClientImpl: Submitted application application_1562812999138_0002
19/07/17 13:59:52 INFO mapreduce.Job: The url to track the job: http://master:8088/proxy/application_1562812999138_0002/
19/07/17 13:59:52 INFO mapreduce.Job: Running job: job_1562812999138_0002
19/07/17 13:59:58 INFO mapreduce.Job: Job job_1562812999138_0002 running in uber mode : false
19/07/17 13:59:58 INFO mapreduce.Job:  map 0% reduce 0%
19/07/17 14:00:08 INFO mapreduce.Job:  map 100% reduce 0%
19/07/17 14:00:14 INFO mapreduce.Job:  map 100% reduce 100%
19/07/17 14:00:14 INFO mapreduce.Job: Job job_1562812999138_0002 completed successfully
19/07/17 14:00:14 INFO mapreduce.Job: Counters: 49
    File System Counters
        FILE: Number of bytes read=626
        FILE: Number of bytes written=332034
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=3361
        HDFS: Number of bytes written=572
        HDFS: Number of read operations=9
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=2
    Job Counters
        Launched map tasks=2
        Launched reduce tasks=1
        Data-local map tasks=2
        Total time spent by all maps in occupied slots (ms)=14931
        Total time spent by all reduces in occupied slots (ms)=4158
        Total time spent by all map tasks (ms)=14931
        Total time spent by all reduce tasks (ms)=4158
        Total vcore-milliseconds taken by all map tasks=14931
        Total vcore-milliseconds taken by all reduce tasks=4158
        Total megabyte-milliseconds taken by all map tasks=15289344
        Total megabyte-milliseconds taken by all reduce tasks=4257792
    Map-Reduce Framework
        Map input records=23
        Map output records=22
        Map output bytes=576
        Map output materialized bytes=632
        Input split bytes=158
        Combine input records=0
        Combine output records=0
        Reduce input groups=21
        Reduce shuffle bytes=632
        Reduce input records=22
        Reduce output records=21
        Spilled Records=44
        Shuffled Maps =2
        Failed Shuffles=0
        Merged Map outputs=2
        GC time elapsed (ms)=837
        CPU time spent (ms)=4990
        Physical memory (bytes) snapshot=711663616
        Virtual memory (bytes) snapshot=6342242304
        Total committed heap usage (bytes)=473956352
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters
        Bytes Read=3203
    File Output Format Counters
        Bytes Written=572
19/07/17 14:00:14 INFO streaming.StreamJob: Output directory: /cc
[root@master abc]# hadoop fs -text /cc/*
13480253104,180,180,360
13502468823,7335,110349,117684
13560436666,1116,954,2070
13560439658,2034,5892,7926
13602846565,1938,2910,4848
13660577991,6960,690,7650
13719199419,240,0,240
13726230503,2481,24681,27162
13726238888,2481,24681,27162
13760778710,120,120,240
13826544101,264,0,264
13922314466,3008,3720,6728
13925057413,11058,48243,59301
13926251106,240,0,240
13926435656,132,1512,1644
15013685858,3659,3538,7197
15920133257,3156,2936,6092
15989002119,1938,180,2118
18211575961,1527,2106,3633
18320173382,9531,2412,11943
84138413,4116,1432,5548