forked from itratrahman/mapreduce_with_mrjobs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
5WordFrequecy.py
69 lines (47 loc) · 2.14 KB
/
5WordFrequecy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
####This source code gives an example demonstration of how to compute word frequency from a book
##import statements
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
##Regular expression pattern matching a single word: a maximal run of
##word characters (\w) and apostrophes, so a contraction such as "don't"
##is kept as one token. Compiled once at module level and reused per line.
WORD_REGEXP = re.compile(r"[\w']+")
##The MapReduce job class, defined as a subclass of MRJob
class WordFrequency(MRJob):
    """Two-step MapReduce job that counts words and re-keys them by count.

    Step 1 tokenizes each input line and sums the occurrences of every
    lowercased word. Step 2 turns each (word, count) pair into
    (zero-padded count, word) and emits one such pair per word.
    """

    ##Function to define the multistep MapReduce job
    def steps(self):
        """Return the two MRStep stages, in execution order."""
        return [
            ##1st step: tokenize the lines and total the occurrences per word
            MRStep(mapper=self.mapper_get_words,
                   reducer=self.reducer_count_words),
            ##2nd step: re-key every word by its zero-padded count
            MRStep(mapper=self.mapper_make_counts_key,
                   reducer=self.reducer_output_words),
        ]

    ##1st mapper: splits a line into words
    def mapper_get_words(self, _, line):
        """Yield (lowercased word, 1) for every word found in `line`.

        The incoming key is ignored. Words are the non-overlapping matches
        of WORD_REGEXP in the line.
        """
        for word in WORD_REGEXP.findall(line):
            ##Decode only when the token is a byte string. The previous
            ##unconditional unicode(...) call does not exist on Python 3 and
            ##raises TypeError on Python 2 when the token is already unicode.
            ##Undecodable bytes are dropped, as before.
            if isinstance(word, bytes):
                word = word.decode("utf-8", "ignore")
            ##Yielding the word (all letters lowercased), and 1
            yield word.lower(), 1

    ##1st reducer: totals the occurrences of one word
    def reducer_count_words(self, word, values):
        """Yield (word, sum of the values received for that word)."""
        yield word, sum(values)

    ##2nd mapper: swaps the pair so that the count becomes the key
    def mapper_make_counts_key(self, words, count):
        """Yield (count as a 4-digit zero-padded string, word).

        `words` is the key produced by step 1 and `count` is its total.
        """
        ##NOTE(review): the padding is presumably there so that string
        ##ordering of the keys matches numeric ordering; that only holds
        ##while counts stay below 10000, since '%04d' does not truncate and
        ##'10000' compares lower than '9999' as a string. Width left
        ##unchanged to keep the output format stable.
        yield '%04d' % int(count), words

    ##2nd reducer: flattens the words that share one count
    def reducer_output_words(self, count, words):
        """Yield one (count, word) pair for each word in `words`."""
        for word in words:
            yield count, word
##The main method of the sourcecode
if __name__ == '__main__':
    ##Script entry point: launch the job only when this file is executed
    ##directly, not when it is imported as a module.
    WordFrequency.run()