Skip to content

Commit

Permalink
Reworked cNDarray, no more replicating indexes (#484)
Browse files Browse the repository at this point in the history
* added a synch checking that is too slow

* stash

* stash before merging in develop branch

* stash

* moving back to desktop

* got types back

* all types including histories now preserved

* point set now fixed

* dummy fix, mergefix

* whitespace

* review comment cleanup
  • Loading branch information
PaulTalbot-INL authored and wangcj05 committed Dec 19, 2017
1 parent be18e77 commit 3467d7b
Show file tree
Hide file tree
Showing 10 changed files with 193 additions and 164 deletions.
9 changes: 4 additions & 5 deletions framework/DataObjects/TestXDataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def formatRealization(rlz):
# NOTE histories are currently disabled pending future work (c,y are history vars)
checkArray('DataSet __init__ inp',data._inputs,['a','b','c'],str)
checkArray('DataSet __init__ out',data._outputs,['x','y','z'],str)
checkArray('DataSet __init__ all',data._allvars,['a','b','c','x','y','z'],str)
checkArray('DataSet __init__ all',data.vars,['a','b','c','x','y','z'],str)
checkNone('DataSet __init__ _data',data._data)
checkNone('DataSet __init__ _collector',data._collector)

Expand Down Expand Up @@ -619,6 +619,7 @@ def formatRealization(rlz):
dataCSV.messageHandler = mh
dataCSV._readMoreXML(xml)
dataCSV.load(csvname,style='CSV')

for var in data.getVars():
if var == 'z':
# not included in XML input specs, so should be left out
Expand Down Expand Up @@ -766,7 +767,6 @@ def formatRealization(rlz):
checkArray('Remove variable remaining vars',data.getVars(),['a'],str)
checkRlz('Remove variable rlz -1',data.realization(index=-1),rlz)
# collapse and re-check
print('PRE:',data._data)
data.asDataset()
checkArray('Remove variable remaining vars',data.getVars(),['a'],str)
checkRlz('Remove variable rlz -1',data.realization(index=-1),rlz)
Expand Down Expand Up @@ -824,7 +824,7 @@ def formatRealization(rlz):
'y': np.array([200.1]),
'varsUpdate': np.array([1])}
data.addRealization(rlz1_1)
tid = data._collector[-1,data._allvars.index('trajID')]
tid = data._collector[-1,data._orderedVars.index('trajID')]
checkRlz('Cluster extend traj 2[1]',data.realization(matchDict={'trajID':2,'varsUpdate':1})[1],rlz1_1,skip='varsUpdate')
# print it
fname = 'XDataUnitTestClusterLabels'
Expand Down Expand Up @@ -916,7 +916,7 @@ def formatRealization(rlz):
'dbo':np.array([ False, True, False]),
't':np.array(['one','two','manystringchars'])}
data.addRealization(rlz)
#print('DEBUGG first',data.asDataset())
data.asDataset()
# check types
for var in rlz.keys():
correct = rlz[var].dtype
Expand All @@ -925,7 +925,6 @@ def formatRealization(rlz):
checkSame('dtype checking "{}"'.format(var),data.asDataset()[var].dtype,correct)

data.addRealization(rlz2)
#print('DEBUGG second',data.asDataset())

print(results)

Expand Down
2 changes: 1 addition & 1 deletion framework/DataObjects/TestXHistorySet.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ def formatRealization(rlz):
# NOTE histories are currently disabled pending future work (c,y are history vars)
checkArray('HistorySet __init__ inp',data._inputs,['a','b'],str)
checkArray('HistorySet __init__ out',data._outputs,['x','y'],str)
checkArray('HistorySet __init__ all',data._allvars,['a','b','x','y'],str)
checkArray('HistorySet __init__ all',data._orderedVars,['a','b','x','y'],str)
checkNone('HistorySet __init__ _data',data._data)
checkNone('HistorySet __init__ _collector',data._collector)

Expand Down
2 changes: 1 addition & 1 deletion framework/DataObjects/TestXPointSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ def formatRealization(rlz):
data._readMoreXML(xml)
checkArray('DataSet __init__ inp',data._inputs,['a','b'],str)
checkArray('DataSet __init__ out',data._outputs,['x','z'],str)
checkArray('DataSet __init__ all',data._allvars,['a','b','x','z'],str)
checkArray('DataSet __init__ all',data._orderedVars,['a','b','x','z'],str)
checkNone('DataSet __init__ _data',data._data)
checkNone('DataSet __init__ _collector',data._collector)

Expand Down
4 changes: 2 additions & 2 deletions framework/DataObjects/XDataObject.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def __init__(self):
self._inputs = [] # list(str) if input variables
self._outputs = [] # list(str) of output variables
self._metavars = [] # list(str) of POINTWISE metadata variables
self._allvars = [] # list(str) of vars IN ORDER of their index
self._orderedVars = [] # list(str) of vars IN ORDER of their index

self._meta = {} # dictionary to collect meta until data is collapsed
self._heirarchal = False # if True, non-traditional format (not yet implemented)
Expand Down Expand Up @@ -201,7 +201,7 @@ def _readMoreXML(self,xmlNode):
self._inputs.remove(index)
except ValueError:
pass #not requested as input anyway
self._allvars = self._inputs + self._outputs
self._orderedVars = self._inputs + self._outputs
if self.messageHandler is None:
self.messageHandler = MessageCourier()

Expand Down
228 changes: 158 additions & 70 deletions framework/DataObjects/XDataSet.py

Large diffs are not rendered by default.

31 changes: 2 additions & 29 deletions framework/DataObjects/XHistorySet.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,34 +95,7 @@ def _setDefaultPivotParams(self):
self._pivotParams = {self._tempPivotParam:self._outputs[:]}

### INTERNAL USE FUNCTIONS ###
def _collapseNDtoDataArray(self,data,var,labels=None):
  """
    Sanity-checks one variable's samples for a HistorySet and then hands them to the
    DataSet implementation, which builds the xr.DataArray.
    @ In, data, np.ndarray, one-dimensional array of samples (entries are scalars or xr.DataArray)
    @ In, var, str, name of the variable the samples belong to
    @ In, labels, list, optional, labels for the samples; defaults to range(len(data))
    @ Out, DataArray, xr.DataArray, result of DataSet._collapseNDtoDataArray
  """
  # generic checks; these mirror the base class (TODO abstract them)
  assert(isinstance(data,np.ndarray))
  assert(len(data.shape) == 1)
  if labels is not None:
    # one label per sample
    assert(len(labels) == len(data))
  else:
    labels = range(len(data))
  # history-set specific checks (TODO should these live in addRealization instead?)
  isInput = var in self._inputs
  if isInput:
    # inputs are single entries; only the first sample is inspected
    assert(isinstance(data[0],(float,str,unicode,int)))
  elif var in self._outputs:
    first = data[0]
    # outputs are xr.DataArrays ...
    assert(isinstance(first,xr.DataArray))
    # ... with exactly one independent coordinate ...
    assert(len(first.dims) == 1)
    # ... and that coordinate is the pivot parameter
    assert(first.dims[0] == self._pivotParams.keys()[0])
  # variables that are neither input nor output get no extra checks
  return DataSet._collapseNDtoDataArray(self,data,var,labels)


def _fromCSV(self,fileName,**kwargs):
"""
Expand Down Expand Up @@ -217,7 +190,7 @@ def _toCSV(self,fileName,start=0,**kwargs):
else:
data = self._data
mode = 'w'
toDrop = list(var for var in self._allvars if var not in keep)
toDrop = list(var for var in self._orderedVars if var not in keep)
data = data.drop(toDrop)
self.raiseADebug('Printing data to CSV: "{}"'.format(fileName+'.csv'))
# specific implementation
Expand Down
39 changes: 6 additions & 33 deletions framework/DataObjects/XPointSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,34 +93,6 @@ def _readMoreXML(self,xmlNode):
self._selectOutput = ('outputRow',-1)

### INTERNAL USE FUNCTIONS ###
def _collapseNDtoDataArray(self,data,var,labels=None):
  """
    Converts a row of numpy samples into a single DataArray suitable for a xr.Dataset.
    @ In, data, np.ndarray, one-dimensional array of samples
    @ In, var, str, name of the variable being acted on
    @ In, labels, list, optional, labels to use along self.sampleTag; defaults to range(len(data))
    @ Out, array, xr.DataArray, the samples indexed by self.sampleTag and named "var"
  """
  # TODO this is slightly different but quite similar to the base class. Should it be separate?
  assert(isinstance(data,np.ndarray))
  assert(len(data.shape) == 1)
  if labels is None:
    labels = range(len(data))
  else:
    assert(len(labels) == len(data))
  # Only the first entry is type-checked, and None is not accepted.
  # NOTE in LimitSurfaceSearch, first can be "None", floats come later
  assert(isinstance(data[0],(float,str,unicode,int,)))
  # the name is set at construction; a separate rename() call would only return a copy
  array = xr.DataArray(data,
                       dims=[self.sampleTag],
                       coords={self.sampleTag:labels},
                       name=var)
  return array

def _convertFinalizedDataRealizationToDict(self,rlz, unpackXArray=False):
"""
After collapsing into xr.Dataset, all entries are stored as xr.DataArrays.
Expand Down Expand Up @@ -157,12 +129,9 @@ def _selectiveRealization(self,rlz):
elif var in self._outputs or var in self._metavars:
# TODO where does metadata get picked from? Seems like output fits best?
method,indic = self._selectOutput
# pivot variables might be included here; try removing them
elif var in self.indexes:
continue # don't need to handle coordinate dimensions, they come with values
# pivot variables are included here in "else"; remove them after they're used in operators
else:
toRemove.append(var)
print('DEBUGG unhandled:',var)
continue
if method in ['inputRow','outputRow']:
# zero-d xarrays give false behavior sometimes
Expand All @@ -180,7 +149,11 @@ def _selectiveRealization(self,rlz):
pivotParam = self.getDimensions(var)
assert(len(pivotParam) == 1) # TODO only handle History for now
pivotParam = pivotParam[var][0]
rlz[var] = float(val.sel(**{pivotParam:indic, 'method':b'nearest'})) #casting as str not unicode
idx = (np.abs(rlz[pivotParam] - indic)).argmin()
rlz[var] = rlz[var][idx]
# if history is dataarray -> not currently possible, but keep for when it's needed
#if type(rlz[var]).__name__ == 'DataArray':
# rlz[var] = float(val.sel(**{pivotParam:indic, 'method':b'nearest'})) #casting as str not unicode
# TODO allowing inexact matches; it's finding the nearest
elif method == 'operator':
if indic == 'max':
Expand Down
2 changes: 1 addition & 1 deletion framework/Models/Dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def _inputToInternal(self,dataIN):
for entries in dataIN.getVars('input'):
  # start a fresh list the first time this input variable is seen
  if localInput[entries] is None:
    localInput[entries] = []
  # Select sample number "hist" along the sample dimension.
  # The data object in this scope is "dataIN"; "dataIn" is not a name visible here.
  value = dataSet.isel(**{dataIN.sampleTag:hist})[entries].values
  # repeat the value sizeIndex times, keeping its dtype
  localInput[entries].append(np.full((sizeIndex,),value,dtype=value.dtype))
#Now if an OutputPlaceHolder is used it is removed, this happens when the input data is not representing is internally manufactured
if 'OutputPlaceHolder' in dataIN.getVars('output'):
Expand Down
39 changes: 17 additions & 22 deletions framework/utils/cached_ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,41 +339,36 @@ def append(self,entry):
if type(entry) not in [np.ndarray]:
raise IOError('Tried to add new data to cNDarray. Can only accept np.ndarray, but got '+type(entry).__name__)
# for now require full correct shape, later handle the single entry case
if len(entry.shape)!=2:
if len(entry.shape)!=1:
# TODO single entry case
raise IOError('Tried to add new data to cNDarray. Need shape (#,{}) but got "{}"!'.format(self.width,entry.shape))
raise IOError('Tried to add new data to cNDarray. Need shape ({},) but got "{}"!'.format(self.width,entry.shape))
# must have matching width (fix for single entry case)
if entry.shape[1] != self.width:
raise IOError('Tried to add new data to cNDarray. Need {} entities per entry, but got '.format(self.width)+str(entry.shape[1]))
if entry.shape[0] != self.width:
raise IOError('Tried to add new data to cNDarray. Need {} entries in array, but got '.format(self.width)+str(entry.shape[0]))
# check if there's enough space in cache to append the new entries
if self.size + entry.shape[0] > self.capacity:
if self.size + 1 > self.capacity:
# since there's not enough space, quadruple available space # TODO change growth parameter to be variable?
self.capacity += max(self.capacity*4,entry.shape[0])
self.capacity += self.capacity*3
newdata = np.zeros((self.capacity,self.width),dtype=self.values.dtype)
newdata[:self.size] = self.values[:self.size]
self.values = newdata
self.values[self.size:self.size+entry.shape[0]][:] = entry[:]
self.size += entry.shape[0]
self.values[self.size] = entry[:]
self.size += 1

def addEntity(self,vals,firstEver=False):
  """
    Adds a column to the dataset.
    @ In, vals, list, as list(#,#,#) where # is either single-valued or numpy array; needs self.size entries
    @ In, firstEver, bool, optional, not read by this method
    @ Out, None
  """
  # Without this check a single-entry "vals" would be broadcast silently into every
  # filled row, and other mismatches would only fail inside numpy.
  if len(vals) != self.size:
    raise IOError('Wrong number ({}) of initial values passed to add entity! Need {}.'.format(len(vals),self.size))
  # create a new column with up to the cached capacity
  new = np.ndarray(self.capacity,dtype=object)
  # fill up to current filled size with the values
  new[:self.size] = vals
  # reshape so it can be stacked onto the existing data
  new = new.reshape(self.capacity,1)
  # "hstack" stacks along the second dimension, or columns for us
  self.values = np.hstack((self.values,new))
  self.width += 1

def getData(self):
Expand Down
1 change: 1 addition & 0 deletions tests/framework/utils/testCachedNDArray.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def checkAnswer(comment,value,expected,tol=1e-10,updateResults=True):
#test min
checkAnswer('index min',testArray.returnIndexMin(),5)


#test repr
msg = str(testArray)
right = 'array([ -3.14 , 2.99792, 2.718 , 8.987 , 0.618 , -6.626 ,\n 12.56 , 6.67 ])'
Expand Down

0 comments on commit 3467d7b

Please sign in to comment.