-
Notifications
You must be signed in to change notification settings - Fork 7
/
CleanCSV.py
42 lines (35 loc) · 1.21 KB
/
CleanCSV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import csv, sys
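# This script filters a csv file by the value in a specified column.
# The header record (first row) of the input is skipped and is not written.
# The output file does not quote columns and uses the pipe delimiter.
# Command line as follows (<col number> is a zero-based column index; the last
# two arguments are optional and, when omitted, no filtering is applied):
# python CleanCSV.py <input file name> <output file name> <col number> <filter value>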
def _parse_args(argv):
    """Split the command line into its four logical arguments.

    Args:
        argv: Full argument vector, program name first (same layout as
            ``sys.argv``).

    Returns:
        A tuple ``(inputfile, outputfile, filtercol, filterval)``.
        ``filtercol`` and ``filterval`` are both ``None`` when no filter was
        requested.

    Raises:
        SystemExit: With a usage message when the file names are missing,
            when a column number is given without a filter value, or when
            the column number is not an integer.
    """
    usage = ("usage: python CleanCSV.py <input file name> <output file name> "
             "[<col number> <filter value>]")
    # The filter needs BOTH the column and the value; a lone column number
    # would otherwise fail later with an IndexError on argv[4].
    if len(argv) < 3 or len(argv) == 4:
        raise SystemExit(usage)
    inputfile, outputfile = argv[1], argv[2]
    if len(argv) == 3:
        return inputfile, outputfile, None, None
    try:
        filtercol = int(argv[3])
    except ValueError:
        raise SystemExit(
            "<col number> must be an integer, got {!r}\n{}".format(argv[3], usage)
        ) from None
    return inputfile, outputfile, filtercol, argv[4]


def clean_csv(inputfile, outputfile, filtercol=None, filterval=None,
              progress_every=100000):
    """Copy the data rows of a CSV file to a pipe-delimited, unquoted file.

    The first record of the input (the header) is skipped and never written.
    When ``filtercol`` is given, only rows whose value in that column equals
    ``filterval`` are written; rows too short to have that column are
    treated as non-matching.

    Args:
        inputfile: Path of the CSV file to read.
        outputfile: Path of the file to write (overwritten if it exists).
        filtercol: Column index to test (negative values index from the end
            of the row), or ``None`` to write every data row.
        filterval: String the filter column must equal.
        progress_every: Print running counts each time this many records
            have been read; ``0`` or ``None`` disables progress output.

    Returns:
        A tuple ``(readcount, writecount)``; ``readcount`` includes the
        header record.
    """
    readcount = writecount = 0
    # newline='' on both files lets the csv module handle line endings
    # itself, including newlines embedded in quoted fields.
    with open(inputfile, mode='r', newline='') as csvinfile, \
            open(outputfile, mode='w', newline='') as csvoutfile:
        datawriter = csv.writer(csvoutfile, delimiter='|',
                                quoting=csv.QUOTE_NONE, escapechar='\\')
        for row in csv.reader(csvinfile):
            readcount += 1
            if readcount == 1:  # skip the header record
                continue
            if filtercol is None:
                keep = True
            else:
                # Bounds check first so short or blank rows are skipped
                # instead of raising IndexError.
                keep = (-len(row) <= filtercol < len(row)
                        and row[filtercol] == filterval)
            if keep:
                datawriter.writerow(row)
                writecount += 1
            # Progress reporting is deliberately separate from the
            # filter/write decision so the reported row is still processed.
            if progress_every and readcount % progress_every == 0:
                print("Records read: ", readcount, "\nRecords written:", writecount)
    print("Records read: ", readcount, "\nRecords written:", writecount)
    return readcount, writecount


def main(argv=None):
    """Run the script using ``argv`` (defaults to ``sys.argv``)."""
    inputfile, outputfile, filtercol, filterval = _parse_args(
        sys.argv if argv is None else argv
    )
    print("Input File :", inputfile)
    print("Output File:", outputfile)
    clean_csv(inputfile, outputfile, filtercol, filterval)


if __name__ == "__main__":
    main()