Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved UnorderedCSVDiffer speed #615

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 56 additions & 42 deletions scripts/TestHarness/testers/UnorderedCSVDiffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def __init__(self, test_dir, out_files,relative_error=1e-10,absolute_check=False
self.__test_dir = test_dir
self.__check_absolute_values = absolute_check
self.__rel_err = relative_error
self.__zero_threshold = float(zeroThreshold) if zeroThreshold is not None else None
self.__zero_threshold = float(zeroThreshold) if zeroThreshold is not None else 0.0
if debug or whoAmI:
print('test dir :',self.__test_dir)
print('out files:',self.__out_files)
Expand Down Expand Up @@ -80,13 +80,13 @@ def findRow(self,row,csv):
print('Looking for:\n',row)
print('Looking in:\n',csv)
match = csv.copy()
# reduce all values under threshold to 0
#match = match.replace(np.inf,-sys.maxint)
#match = match.replace(np.nan,sys.maxint)
# mask inf as -sys.max and nan as +sys.max
# TODO can I do this as a single search, using binomial on floats +- rel_err?
for idx, val in row.iteritems():
if debug:
print(' checking index',idx)
print(' checking index',idx,'value',val)
# Due to relative matches in floats, we may not be sorted with respect to this index.
## In an ideal world with perfect matches, we would be. Unfortunately, we have to sort again.
match = match.sort_values(idx)
# check type consistency
## get a sample from the matching CSV column
### TODO could check indices ONCE and re-use instead of checking each time
Expand All @@ -99,41 +99,52 @@ def findRow(self,row,csv):
if debug:
print(' Not same type (number)! lfor: "{}" lin: "{}"'.format(valIsNumber,matchIsNumber))
return []
# compare
# process: determine a condition "cond" whereby to reduce the possible matches
## if not a number, absolute check
if not valIsNumber:
cond = match[idx] == val
## if a number, condition depends on chosen tools
# find index of lowest and highest possible matches
## if values are floats, then matches could be as low as val(1-rel_err) and as high as val(1+rel_err)
if matchIsNumber:
# adjust for negative values
sign = np.sign(val)
lowest = np.searchsorted(match[idx].values,val*(1.0-sign*self.__rel_err))
highest = np.searchsorted(match[idx].values,val*(1.0+sign*self.__rel_err),side='right')-1
## if not floats, then check exact matches
else:
## apply zero threshold
#if False:# FIXME tooooo slooooowwwwww self.__zero_threshold is not None:
# match[idx][abs(match[idx]) < self.__zero_threshold] = 0
# if debug:
# print(' After applying zero threshold, options are:',match[idx])
# val = 0 if abs(val) < self.__zero_threshold else val
## mask infinity
#if val == np.inf:
# val = -sys.maxint
#elif pd.isnull(val):
# val = sys.maxint
## value check: absolute
if self.__check_absolute_values:
cond = abs(match[idx] - val) < self.__rel_err
## value check: relative
else:
# set relative scaling factor to protect against div by 0
if val == 0:
scale = 1.0
else:
scale = abs(val)
cond = abs(match[idx] - val) < scale*self.__rel_err
# limit matches by condition determined
match = match[cond]
lowest = np.searchsorted(match[idx].values,val)
highest = np.searchsorted(match[idx].values,val,side='right')-1
if debug:
print(' low/hi match index:',lowest,highest)
## if lowest is past end of array, no match found
if lowest == len(match[idx]):
if debug:
print(' Match is past end of sort list!')
return []
## if entry at lowest index doesn't match entry, then it's not to be found
if not self.matches(match[idx].values[lowest],val,matchIsNumber,self.__rel_err):
if debug:
print(' Match is not equal to insert point!')
return []
## otherwise, we have some range of matches
match = match[slice(lowest,highest+1)]
if debug:
print(' After searching for {}={}, remaining matches:\n'.format(idx,val),match)
return match

def matches(self,a,b,isNumber,tol):
  """
    Determines if two objects match within tolerance.
    Non-numeric objects must be equal. Numeric objects match if they are equal, or if they
    differ by less than "tol": taken as an absolute difference when the differ was set up to
    check absolute values, otherwise relative to the magnitude of "b".
    @ In, a, object, first object ("measured")
    @ In, b, object, second object ("actual")
    @ In, isNumber, bool, if True then treat as float with tolerance (else check equivalence)
    @ In, tol, float, tolerance at which to hold match (if float)
    @ Out, matches, bool, True if matching
  """
  # non-numeric entries have no notion of tolerance; they must be equal
  if not isNumber:
    return a == b
  # identical numbers always match. Checking this first covers the cases the strict
  ## inequalities below cannot: equal infinities (inf - inf is nan, and nan < tol is False)
  ## and a tolerance of exactly zero. NaN never equals itself, so it still falls through.
  if a == b:
    return True
  # absolute error: the difference itself must be under the tolerance
  if self.__check_absolute_values:
    return abs(a-b) < tol
  # otherwise, relative error: scale the tolerance by the magnitude of the reference value,
  ## using unity when the reference is zero so the allowed difference does not collapse to 0
  scale = abs(b) if b != 0 else 1.0
  return abs(a-b) < scale*tol

def diff(self):
"""
Run the comparison.
Expand Down Expand Up @@ -201,9 +212,9 @@ def diff(self):
## at this point both CSVs have the same shape, with the same header contents.
## align columns
testCSV = testCSV[goldCSV.columns.tolist()]
## set marginal values to zero
testCSV = self.marginalizeZeros(testCSV,self.__zero_threshold)
goldCSV = self.marginalizeZeros(goldCSV,self.__zero_threshold)
## set marginal values to zero, fix infinites
testCSV = self.prepDataframe(testCSV,self.__zero_threshold)
goldCSV = self.prepDataframe(goldCSV,self.__zero_threshold)
## check for matching rows
for idx in goldCSV.index:
find = goldCSV.iloc[idx].rename(None)
Expand All @@ -216,9 +227,11 @@ def diff(self):
self.finalizeMessage(same,msg,testFilename)
return self.__same, self.__message

def marginalizeZeros(self,csv,tol):
def prepDataframe(self,csv,tol):
"""
For any columns that contain numbers, drop near-zero numbers to zero
Does several prep actions:
- For any columns that contain numbers, drop near-zero numbers to zero
- replace infs and nans with symbolic values
@ In, csv, pd.DataFrame, contents to reduce
@ In, tol, float, tolerance sufficiently near zero
@ Out, csv, converted dataframe
Expand All @@ -234,6 +247,7 @@ def marginalizeZeros(self,csv,tol):
if not mathUtils.isAFloatOrInt(example):
continue
# flatten near-zeros
csv[col][np.isclose(csv[col],0,**key)] = 0
csv[col].values[np.isclose(csv[col].values,0,**key)] = 0
# TODO would like to sort here, but due to relative errors it doesn't do enough good. Instead, sort in findRow.
return csv

8 changes: 7 additions & 1 deletion scripts/TestHarness/testers/testUnorderedCSV.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def checkSame(comment,first,second,msg,results=None):
return results

def testAFile(fname):
differ = UCSV('.',[fname])
differ = UCSV('.',[fname],zeroThreshold=5e-14)
differ.diff()
return differ.__dict__['_UnorderedCSVDiffer__same'], differ.__dict__['_UnorderedCSVDiffer__message']

Expand All @@ -46,6 +46,12 @@ def testAFile(fname):
# matching with inf, nan
ok,msg = testAFile('inf.csv')
checkSame('Infinity',ok,True,msg,results)
# zero threshold
ok,msg = testAFile('nearzero.csv')
checkSame('Near zero',ok,True,msg,results)
# sorting
ok,msg = testAFile('sort.csv')
checkSame('sort',ok,True,msg,results)



Expand Down