Skip to content

Commit

Permalink
Improved UnorderedCSVDiffer speed (#615)
Browse files Browse the repository at this point in the history
* cleaned up

* cleanup

* checked out dataobject-rework tests file
  • Loading branch information
PaulTalbot-INL authored and alfoa committed Apr 18, 2018
1 parent da1236b commit 34ad078
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 43 deletions.
98 changes: 56 additions & 42 deletions scripts/TestHarness/testers/UnorderedCSVDiffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def __init__(self, test_dir, out_files,relative_error=1e-10,absolute_check=False
self.__test_dir = test_dir
self.__check_absolute_values = absolute_check
self.__rel_err = relative_error
self.__zero_threshold = float(zeroThreshold) if zeroThreshold is not None else None
self.__zero_threshold = float(zeroThreshold) if zeroThreshold is not None else 0.0
if debug or whoAmI:
print('test dir :',self.__test_dir)
print('out files:',self.__out_files)
Expand Down Expand Up @@ -80,13 +80,13 @@ def findRow(self,row,csv):
print('Looking for:\n',row)
print('Looking in:\n',csv)
match = csv.copy()
# reduce all values under threshold to 0
#match = match.replace(np.inf,-sys.maxint)
#match = match.replace(np.nan,sys.maxint)
# mask inf as -sys.max and nan as +sys.max
# TODO can I do this as a single search, using binomial on floats +- rel_err?
for idx, val in row.iteritems():
if debug:
print(' checking index',idx)
print(' checking index',idx,'value',val)
# Due to relative matches in floats, we may not be sorted with respect to this index.
## In an ideal world with perfect matches, we would be. Unfortunately, we have to sort again.
match = match.sort_values(idx)
# check type consistency
## get a sample from the matching CSV column
### TODO could check indices ONCE and re-use instead of checking each time
Expand All @@ -99,41 +99,52 @@ def findRow(self,row,csv):
if debug:
print(' Not same type (number)! lfor: "{}" lin: "{}"'.format(valIsNumber,matchIsNumber))
return []
# compare
# process: determine a condition "cond" whereby to reduce the possible matches
## if not a number, absolute check
if not valIsNumber:
cond = match[idx] == val
## if a number, condition depends on chosen tools
# find index of lowest and highest possible matches
## if values are floats, then matches could be as low as val(1-rel_err) and as high as val(1+rel_err)
if matchIsNumber:
# adjust for negative values
sign = np.sign(val)
lowest = np.searchsorted(match[idx].values,val*(1.0-sign*self.__rel_err))
highest = np.searchsorted(match[idx].values,val*(1.0+sign*self.__rel_err),side='right')-1
## if not floats, then check exact matches
else:
## apply zero threshold
#if False:# FIXME tooooo slooooowwwwww self.__zero_threshold is not None:
# match[idx][abs(match[idx]) < self.__zero_threshold] = 0
# if debug:
# print(' After applying zero threshold, options are:',match[idx])
# val = 0 if abs(val) < self.__zero_threshold else val
## mask infinity
#if val == np.inf:
# val = -sys.maxint
#elif pd.isnull(val):
# val = sys.maxint
## value check: absolute
if self.__check_absolute_values:
cond = abs(match[idx] - val) < self.__rel_err
## value check: relative
else:
# set relative scaling factor to protect against div by 0
if val == 0:
scale = 1.0
else:
scale = abs(val)
cond = abs(match[idx] - val) < scale*self.__rel_err
# limit matches by condition determined
match = match[cond]
lowest = np.searchsorted(match[idx].values,val)
highest = np.searchsorted(match[idx].values,val,side='right')-1
if debug:
print(' low/hi match index:',lowest,highest)
## if lowest is past end of array, no match found
if lowest == len(match[idx]):
if debug:
print(' Match is past end of sort list!')
return []
## if entry at lowest index doesn't match entry, then it's not to be found
if not self.matches(match[idx].values[lowest],val,matchIsNumber,self.__rel_err):
if debug:
print(' Match is not equal to insert point!')
return []
## otherwise, we have some range of matches
match = match[slice(lowest,highest+1)]
if debug:
print(' After searching for {}={}, remaining matches:\n'.format(idx,val),match)
return match

def matches(self,a,b,isNumber,tol):
  """
    Determines if two objects match within tolerance.
    Non-numbers are compared by equivalence. Numbers that compare exactly equal
    always match; otherwise they are compared by absolute difference (if
    self.__check_absolute_values is set) or by difference relative to "b".
    @ In, a, object, first object ("measured")
    @ In, b, object, second object ("actual")
    @ In, isNumber, bool, if True then treat as float with tolerance (else check equivalence)
    @ In, tol, float, tolerance at which to hold match (if float)
    @ Out, matches, bool, True if matching
  """
  if not isNumber:
    return a == b
  # Identical numbers always match. Without this shortcut the strict "<" tests
  ## below reject equal values when tol is 0, and reject equal infinities
  ## because inf - inf is nan, which compares False against everything.
  if a == b:
    return True
  difference = abs(a-b)
  if self.__check_absolute_values:
    return difference < tol
  # otherwise, relative error; scale by 1.0 when b is 0 so the tolerance does not collapse to zero
  scale = abs(b) if b != 0 else 1.0
  return difference < scale*tol

def diff(self):
"""
Run the comparison.
Expand Down Expand Up @@ -201,9 +212,9 @@ def diff(self):
## at this point both CSVs have the same shape, with the same header contents.
## align columns
testCSV = testCSV[goldCSV.columns.tolist()]
## set marginal values to zero
testCSV = self.marginalizeZeros(testCSV,self.__zero_threshold)
goldCSV = self.marginalizeZeros(goldCSV,self.__zero_threshold)
## set marginal values to zero, fix infinites
testCSV = self.prepDataframe(testCSV,self.__zero_threshold)
goldCSV = self.prepDataframe(goldCSV,self.__zero_threshold)
## check for matching rows
for idx in goldCSV.index:
find = goldCSV.iloc[idx].rename(None)
Expand All @@ -216,9 +227,11 @@ def diff(self):
self.finalizeMessage(same,msg,testFilename)
return self.__same, self.__message

def marginalizeZeros(self,csv,tol):
def prepDataframe(self,csv,tol):
"""
For any columns that contain numbers, drop near-zero numbers to zero
Does several prep actions:
- For any columns that contain numbers, drop near-zero numbers to zero
- replace infs and nans with symbolic values
@ In, csv, pd.DataFrame, contents to reduce
@ In, tol, float, tolerance sufficently near zero
@ Out, csv, converted dataframe
Expand All @@ -234,6 +247,7 @@ def marginalizeZeros(self,csv,tol):
if not mathUtils.isAFloatOrInt(example):
continue
# flatten near-zeros
csv[col][np.isclose(csv[col],0,**key)] = 0
csv[col].values[np.isclose(csv[col].values,0,**key)] = 0
# TODO would like to sort here, but due to relative errors it doesn't do enough good. Instead, sort in findRow.
return csv

8 changes: 7 additions & 1 deletion scripts/TestHarness/testers/testUnorderedCSV.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def checkSame(comment,first,second,msg,results=None):
return results

def testAFile(fname):
differ = UCSV('.',[fname])
differ = UCSV('.',[fname],zeroThreshold=5e-14)
differ.diff()
return differ.__dict__['_UnorderedCSVDiffer__same'], differ.__dict__['_UnorderedCSVDiffer__message']

Expand All @@ -46,6 +46,12 @@ def testAFile(fname):
# matching with inf, nan
ok,msg = testAFile('inf.csv')
checkSame('Infinity',ok,True,msg,results)
# zero threshold
ok,msg = testAFile('nearzero.csv')
checkSame('Near zero',ok,True,msg,results)
# sorting
ok,msg = testAFile('sort.csv')
checkSame('sort',ok,True,msg,results)



Expand Down

0 comments on commit 34ad078

Please sign in to comment.