forked from infochimps-labs/wukong
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsize.rb
executable file
·60 lines (53 loc) · 1.25 KB
/
size.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env ruby
$: << File.dirname(__FILE__)+'/../lib'
require 'wukong'
module Size
#
# Job definition: pipe the whole dataset through the unix +wc+ command in the
# map phase and let a single reducer add up the per-task counts.
#
class Script < Wukong::Script
  #
  # When a unix command already does the job faster than a hand-written
  # mapper would, there is no need to write one: a Wukong::Script subclass
  # can override map_command (or reduce_command) and return the complete
  # command line to run instead.
  #
  def map_command
    '/usr/bin/wc'
  end

  #
  # Inherited default options, with the reducer count pinned to one so that
  # every record arrives at the same reducer and a single total is produced.
  #
  def default_options
    single_reducer = { :reduce_tasks => 1 }
    super.merge(single_reducer)
  end
end
#
# Sums the numeric value of each column in its input
#
class Reducer < Wukong::Streamer::Base
  # Running per-column totals (an Array of Integers); nil until +process+
  # has been called at least once.
  attr_accessor :sums

  #
  # The unix +wc+ command separates its columns with runs of whitespace, not
  # tabs, so split each line on whitespace after trimming both ends.
  #
  # Returns an Array of column strings (empty for a blank line).
  #
  def recordize line
    line.strip.split(/\s+/)
  end

  #
  # Add each value to the running total for the column at the same position.
  #
  # Values are converted with +to_i+, so a missing or non-numeric column
  # counts as 0. The result is as wide as the wider of this record and the
  # totals so far: zipping from +vals+ alone would yield only as many
  # columns as the current record has, so a short (or empty) record would
  # silently discard totals that were already accumulated.
  #
  def process *vals
    totals = sums || []
    width  = [vals.length, totals.length].max
    self.sums = (0...width).map { |col| vals[col].to_i + totals[col].to_i }
  end

  #
  # Run through the whole reduction input via the superclass, then output the
  # totals once at the end.
  #
  # NOTE(review): +sums+ is still nil here if +process+ was never called
  # (e.g. empty input); confirm how +emit+ treats nil.
  #
  def stream *args
    # Explicit parentheses keep the splat from being parsed ambiguously.
    super(*args)
    emit sums
  end
end
end
# Execute the script.
# nil is passed as the first argument -- presumably the mapper slot, left
# empty because Size::Script#map_command supplies the map-side command
# (TODO confirm against Wukong::Script#initialize) -- and Size::Reducer
# as the second.
script = Size::Script.new(nil, Size::Reducer)
script.run