-
Notifications
You must be signed in to change notification settings - Fork 0
/
Processing.py
36 lines (27 loc) · 1.24 KB
/
Processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
"""
This script is used to:
1. Remove all reviews with a rating of 3.
2. Keep words comprised of only alphabets. Any words containing numbers or other characters are removed.
3. Save the resulting dataframe to a parquet file.
TODO: Convert ratings (1,2,4,5) into sentiment at this stage.
TODO: Create reviewID = asin+'-'+'reviewerID' at this stage.
TODO: Split into stratified train-test dataframes.
"""
from pyspark.sql.functions import udf
from CategoryGenerator import CategoryGenerator
import re
def removePunctuation(text):
return re.sub('[^a-z ]', " ", text.lower().strip())
puncUDF = udf(removePunctuation)
def rating_to_sentiment(rating):
return rating > 3
ratingUDF = udf(rating_to_sentiment, BooleanType())
categorygenerator = CategoryGenerator(bucket='pysenti-data',file_path='review-urls/reviews.txt')
for category in categorygenerator:
print(category.name)
amazon = spark.read.json(category.reviews_downloaded_json)
amazon = amazon.filter(amazon.overall!=3)
amazon = amazon.withColumn('cleanedReview',puncUDF('reviewText'))
amazon = amazon.withColumn('target', ratingUDF('overall'))
amazon = amazon.select('asin','reviewerID','cleanedReview','target')
amazon.write.parquet(category.reviews_processed_parquet)