Doing better than "object" typing in DataObject #478

Merged: 2 commits, Dec 14, 2017
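No PR description survives on this page, so as orientation for the diffs below: the change tracks a per-variable type taken from the first realization instead of storing every column as dtype=object, while keeping strings as object because fixed-width numpy string arrays truncate. A minimal sketch of that numpy behavior (written for this write-up, not taken from the PR):

```python
import numpy as np

# Fixed-width string arrays size themselves to the longest entry at creation time,
# so later, longer values are silently truncated.
a = np.array(['one', 'two', 'three'])   # dtype '<U5' (or '|S5' on Python 2)
a[1] = 'manystringchars'
print(a[1])                             # -> 'manys'

# An object array keeps full Python strings, at the cost of typed storage.
b = np.array(['one', 'two', 'three'], dtype=object)
b[1] = 'manystringchars'
print(b[1])                             # -> 'manystringchars'

# Numeric and boolean columns, by contrast, are better off with a real dtype
# (float/int/bool) than with the catch-all object dtype.
c = np.array([1.0, 2.0, 3.0], dtype=float)
```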
97 changes: 72 additions & 25 deletions framework/DataObjects/TestXDataSet.py
@@ -37,7 +37,7 @@
import MessageHandler

mh = MessageHandler.MessageHandler()
mh.initialize({'verbosity':'silent', 'callerLength':10, 'tagLength':10})
mh.initialize({'verbosity':'debug', 'callerLength':10, 'tagLength':10})

print('Module undergoing testing:')
print (XDataSet )
@@ -791,42 +791,43 @@ def formatRealization(rlz):
# register "trajID" (cluster label) and "varsUpdate" (iteration number/monotonically increasing var) as meta
data.addExpectedMeta(['trajID','varsUpdate'])
# add two trajectories to get started, like starting two trajectories
rlz0_0 = {'trajID': np.atleast_1d(1),
'a': np.atleast_1d( 1.0),
'b': np.atleast_1d( 5.0),
'x': np.atleast_1d( 10.0),
'y': np.atleast_1d(100.0),
'varsUpdate': np.atleast_1d(0)}
rlz1_0 = {'trajID': np.atleast_1d(2),
'a': np.atleast_1d( 2.0),
'b': np.atleast_1d( 6.0),
'x': np.atleast_1d( 20.0),
'y': np.atleast_1d(200.0),
'varsUpdate': np.atleast_1d(0)}
rlz0_0 = {'trajID': np.array([1]),
'a': np.array([ 1.0]),
'b': np.array([ 5.0]),
'x': np.array([ 10.0]),
'y': np.array([100.0]),
'varsUpdate': np.array([0])}
rlz1_0 = {'trajID': np.array([2]),
'a': np.array([ 2.0]),
'b': np.array([ 6.0]),
'x': np.array([ 20.0]),
'y': np.array([200.0]),
'varsUpdate': np.array([0])}
data.addRealization(rlz0_0)
data.addRealization(rlz1_0)
checkRlz('Cluster initial traj 1',data.realization(index=0),rlz0_0,skip='varsUpdate')
checkRlz('Cluster initial traj 2',data.realization(index=1),rlz1_0,skip='varsUpdate')
# now sample a new trajectory point, going into the collector
rlz0_1 = {'trajID': np.atleast_1d(1),
'a': np.atleast_1d( 1.1),
'b': np.atleast_1d( 5.1),
'x': np.atleast_1d( 10.1),
'y': np.atleast_1d(100.1),
'varsUpdate': np.atleast_1d(1)}
rlz0_1 = {'trajID': np.array([1]),
'a': np.array([ 1.1]),
'b': np.array([ 5.1]),
'x': np.array([ 10.1]),
'y': np.array([100.1]),
'varsUpdate': np.array([1])}
data.addRealization(rlz0_1)
checkRlz('Cluster extend traj 1[0]',data.realization(matchDict={'trajID':1,'varsUpdate':0})[1],rlz0_0,skip='varsUpdate')
checkRlz('Cluster extend traj 1[1]',data.realization(matchDict={'trajID':1,'varsUpdate':1})[1],rlz0_1,skip='varsUpdate')
checkRlz('Cluster extend traj 2[0]',data.realization(matchDict={'trajID':2,'varsUpdate':0})[1],rlz1_0,skip='varsUpdate')
# now collapse and then append to the data
data.asDataset()
rlz1_1 = {'trajID': np.atleast_1d(2),
'a': np.atleast_1d( 2.1),
'b': np.atleast_1d( 6.1),
'x': np.atleast_1d( 20.1),
'y': np.atleast_1d(200.1),
'varsUpdate': np.atleast_1d(1)}
rlz1_1 = {'trajID': np.array([ 2]),
'a': np.array([ 2.1]),
'b': np.array([ 6.1]),
'x': np.array([ 20.1]),
'y': np.array([200.1]),
'varsUpdate': np.array([1])}
data.addRealization(rlz1_1)
tid = data._collector[-1,data._allvars.index('trajID')]
checkRlz('Cluster extend traj 2[1]',data.realization(matchDict={'trajID':2,'varsUpdate':1})[1],rlz1_1,skip='varsUpdate')
# print it
fname = 'XDataUnitTestClusterLabels'
@@ -884,6 +885,52 @@ def formatRealization(rlz):
checkRlz('Cluster read [1]',data2.realization(index=1),correct)


######################################
# DATA TYPING #
######################################
## check that types are set correctly, both for histories and scalars
xml = createElement('DataSet',attrib={'name':'test'})
xml.append(createElement('Input', text=' fl, in, st, un, bo'))
xml.append(createElement('Output',text='dfl,din,dst,dun,dbo'))
xml.append(createElement('Index',attrib={'var':'t'},text='dfl,din,dst,dun,dbo'))
data = XDataSet.DataSet()
data.messageHandler = mh
data._readMoreXML(xml)

rlz = {'fl' :np.array([ 1.0]),
'in' :np.array([ 2]),
'st' :np.array([ 'msg']),
'un' :np.array([u'utf']),
'bo' :np.array([ True]),
'dfl':np.array([ 1.0, 2.0, 3.0]),
'din':np.array([ 4, 5, 6]),
'dst':np.array([ 'a', 'b', 'c']),
'dun':np.array([ u'x', u'y', u'z']),
'dbo':np.array([ True,False, True]),
't':np.array(['one','two','three'])}
rlz2= {'fl' :np.array([ 10.0]),
'in' :np.array([ 20]),
'st' :np.array([ 'msg2']),
'un' :np.array([u'utf2']),
'bo' :np.array([ False]),
'dfl':np.array([ 10.0, 20.0, 30.0]),
'din':np.array([ 40, 50, 60]),
'dst':np.array([ 'a2', 'b2', 'c2']),
'dun':np.array([ u'x2', u'y2', u'z2']),
'dbo':np.array([ False, True, False]),
't':np.array(['one','two','manystringchars'])}
data.addRealization(rlz)
#print('DEBUGG first',data.asDataset())
# check types
for var in rlz.keys():
correct = rlz[var].dtype
if correct.type in [np.unicode_,np.string_]:
correct = object
checkSame('dtype checking "{}"'.format(var),data.asDataset()[var].dtype,correct)

data.addRealization(rlz2)
#print('DEBUGG second',data.asDataset())

print(results)

sys.exit(results["fail"])
6 changes: 3 additions & 3 deletions framework/DataObjects/XDataObject.py
@@ -155,11 +155,11 @@ def _readMoreXML(self,xmlNode):
for child in inp.subparts:
# TODO check for repeats, "notAllowdInputs", names in both input and output space
if child.getName() == 'Input':
self._inputs.extend(list(x for x in child.value.split(',') if x.strip()!=''))
self._inputs.extend(list(x.strip() for x in child.value.split(',') if x.strip()!=''))
elif child.getName() == 'Output':
self._outputs.extend(list(x for x in child.value.split(',') if x.strip()!=''))
self._outputs.extend(list(x.strip() for x in child.value.split(',') if x.strip()!=''))
elif child.getName() == 'Index':
depends = child.value.split(',')
depends = list(d.strip() for d in child.value.split(','))
var = child.parameterValues['var']
self._pivotParams[var] = depends
# options node
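The _readMoreXML change above strips whitespace around the comma-separated variable names, which is what lets the new DataSet typing test write its node text as ' fl, in, st, un, bo'. A quick illustration (not from the PR) of the difference:

```python
text = ' fl, in, st, un, bo'  # node text as written in the new typing test above

without_strip = [x for x in text.split(',') if x.strip() != '']
with_strip    = [x.strip() for x in text.split(',') if x.strip() != '']

print(without_strip)  # [' fl', ' in', ' st', ' un', ' bo'] -> stray spaces end up in variable names
print(with_strip)     # ['fl', 'in', 'st', 'un', 'bo']
```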
100 changes: 87 additions & 13 deletions framework/DataObjects/XDataSet.py
@@ -130,7 +130,7 @@ def addMeta(self,tag,xmlDict):
# Otherwise, scalarMetric
else:
# sanity check to make sure suitable values are passed in
assert(isinstance(value,(str,unicode,float,int)))
assert(isinstance(value,(basestring,float,int)))
destination.addScalar(target,metric,value)

def addRealization(self,rlz):
@@ -158,8 +158,10 @@ def addRealization(self,rlz):
rlz = self._formatRealization(rlz)
# perform selective collapsing/picking of data
rlz = self._selectiveRealization(rlz)
# FIXME if no scalar entry is made, this construction fails.
# Instead of treating each dataarrray as an object, numpy.asarray calls their asarray methods,
## establish types if not done yet
self._setDataTypes(rlz)
# NB If no scalar entry is made, this construction fails. In that case,
# instead of treating each dataarrray as an object, numpy.asarray calls their asarray methods,
# unfolding them and making a full numpy array with more dimensions, instead of effectively
# a list of realizations, where each realization is effectively a list of xr.DataArray objects.
#
@@ -170,6 +172,7 @@
# each of which is a numpy array of some combination of scalar values and/or xr.DataArrays.
# This is because the cNDarray collector expects a LIST of realization, not a single realization.
# Maybe the "append" method should be renamed to "extend" or changed to append one at a time.
## set realizations as a list of realizations (which are ordered lists)
newData = np.asarray([list(rlz[var] for var in self._allvars)+[0.0]],dtype=object)
newData = newData[:,:-1]
# if data storage isn't set up, set it up
@@ -231,7 +234,7 @@ def checkIndexAlignment(self,indexesToCheck=None):
@ Out, same, bool, if True then alignment is good
"""
# format request so that indexesToCheck is always a list
if isinstance(indexesToCheck,(str,unicode)):
if isinstance(indexesToCheck,basestring):
indexesToCheck = [indexesToCheck]
elif indexesToCheck is None:
indexesToCheck = self.indexes[:]
@@ -352,7 +355,7 @@ def getVarValues(self,var):
# For faster access, consider using data.asDataset()['varName'] for one variable, or
# data.asDataset()[ ('var1','var2','var3') ] for multiple.
self.asDataset()
if isinstance(var,(str,unicode)):
if isinstance(var,basestring):
val = self._data[var]
#format as scalar
if len(val.dims) == 0:
@@ -496,6 +499,16 @@ def remove(self,realization=None,variable=None):
for varlist in [self._inputs,self._outputs,self._metavars]:
if variable in varlist:
varlist.remove(variable)
# remove from pivotParams, and remove any indexes without keys
for pivot in self.indexes:
print('DEBUGG checking pivot',pivot,self._pivotParams[pivot])
if variable in self._pivotParams[pivot]:
self._pivotParams[pivot].remove(variable)
if len(self._pivotParams[pivot]) == 0:
del self._pivotParams[pivot]
# if in self._data, clear the index
if not noData and pivot in self._data.dims:
del self._data[pivot] # = self._data.drop(pivot,dim
# TODO remove references from general metadata?
if self._scaleFactors is not None:
self._scaleFactors.pop(variable,None)
@@ -570,6 +583,7 @@ def __init__(self):#, in_vars, out_vars, meta_vars=None, dynamic=False, var_dims
DataObject.__init__(self)
self.name = 'DataSet'
self.type = 'DataSet'
self.types = None # list of type objects, for each realization entry
self.printTag = self.name
self.defaultDtype = object
self._scaleFactors = {} # mean, sigma for data for matching purposes
@@ -640,13 +654,14 @@ def _collapseNDtoDataArray(self,data,var,labels=None):
"""
assert(isinstance(data,np.ndarray))
assert(len(data.shape) == 1)
# set up sampleTag values
if labels is None:
labels = range(len(data))
else:
assert(len(labels) == len(data))
#method = 'once' # see below, parallelization is possible but not implemented
# first case: single entry per node: floats, strings, ints, etc
if isinstance(data[0],(float,str,unicode,int)):
if isinstance(data[0],(float,basestring,int,bool,np.bool_)):
array = xr.DataArray(data,
dims=[self.sampleTag],
coords={self.sampleTag:labels},
@@ -728,7 +743,8 @@ def _convertArrayListToDataset(self,array,action=None):
elif action == 'extend':
# TODO compatability check!
# TODO Metadata update?
self._data.merge(new,inplace=True)
# merge can change dtypes b/c no NaN int type: self._data.merge(new,inplace=True)
self._data = xr.concat([self._data,new],dim=self.sampleTag)
# set up scaling factors
self._setScalingFactors()
return new
@@ -804,8 +820,16 @@ def _convertToXrDataset(self):
firstSample = int(self._data[self.sampleTag][-1])+1 if self._data is not None else 0
arrs = {}
for v,var in enumerate(self._allvars):
# gather data type from first realization
dtype = self.types[v]
# if not scalar, then dtype is embedded below this level
if isinstance(data[0,v],xr.DataArray):
varData = data[:,v]
# if scalar, however, need to force correct dtype here
else:
varData = np.array(data[:,v],dtype=dtype)
# create single dataarrays
arrs[var] = self._collapseNDtoDataArray(data[:,v],var)
arrs[var] = self._collapseNDtoDataArray(varData,var)
# re-index samples
arrs[var][self.sampleTag] += firstSample
# collect all data into dataset, and update self._data
@@ -814,7 +838,7 @@
else:
self._convertArrayListToDataset(arrs,action='extend')
# reset collector
self._collector = self._newCollector(width=self._collector.width,dtype=self._collector.values.dtype)
self._collector = self._newCollector(width=self._collector.width)
return self._data

def _formatRealization(self,rlz):
Expand All @@ -825,11 +849,14 @@ def _formatRealization(self,rlz):
"""
# TODO this could be much more efficient on the parallel (finalizeCodeOutput) than on serial
# TODO costly for loop
indexes = []
# do indexes first to assure correct typing on first realization collection
if self._collector is None or len(self._collector) == 0:
for var in self._pivotParams.keys():
dtype = self._getCompatibleType(rlz[var][0])
rlz[var] = np.array(rlz[var],dtype=dtype)
for var,val in rlz.items():
# if an index variable, skip it and mark it for removal
# if an index variable, skip it
if var in self._pivotParams.keys():
indexes.append(var)
continue
dims = self.getDimensions(var)[var]
## change dimensionless to floats -> TODO use operator to collapse!
@@ -839,8 +866,12 @@
## reshape multidimensional entries into dataarrays
else:
coords = dict((d,rlz[d]) for d in dims)
# if first entry in collector, assure types are correct for the index
if self._collector is None or len(self._collector) == 0:
dtype = self._getCompatibleType(val[0])
val = np.array(val,dtype=dtype)
rlz[var] = self.constructNDSample(val,dims,coords,name=var)
for var in indexes:
for var in self._pivotParams.keys():
del rlz[var]
return rlz

@@ -936,6 +967,10 @@ def _fromCSVXML(self,fName):
def _fromDict(self,source,dims=None,**kwargs):
"""
Loads data from a dictionary with variables as keys and values as np.arrays of realization values
Format for entries in "source":
- scalars: source['a'] = np.array([1, 2, 3, 4]) -> each entry is a realization
- vectors: source['b'] = np.array([ np.array([1, 2]), np.array([3,4,5]) ]) -> each entry is a realization
- indexes: same as "vectors"
@ In, source, dict, as {var:values} with types {str:np.array}
@ In, dims, dict, optional, ordered list of dimensions that each var depends on as {var:[list]}
@ In, kwargs, dict, optional, additional arguments
@@ -986,6 +1021,9 @@ def _fromDict(self,source,dims=None,**kwargs):
data[:,i] = values
# set up collector as cached nd array of values
self._collector = cached_ndarray.cNDarray(values=data,dtype=object)
# set datatypes for each variable
rlz = self.realization(index=0)
self._setDataTypes(rlz)
# collapse into xr.Dataset
self.asDataset()

@@ -1007,6 +1045,30 @@ def _fromNetCDF(self,fName, **kwargs):
for key,val in self._data.attrs.items():
self._meta[key] = pk.loads(val.encode('utf-8'))

def _getCompatibleType(self,val):
"""
Determines the data type for "val" that is compatible with the rest of the data object.
@ In, val, object, item whose type should be determined.
@ Out, _type, type instance, type to use
"""
# ND uses first entry as example type
if isinstance(val,xr.DataArray):
val = val.item(0)
# identify other scalars by instance
if isinstance(val,float):
_type = float
elif isinstance(val,int):
_type = int
elif isinstance(val,(bool,np.bool_)):
_type = bool
# strings and unicode have to be stored as objects to prevent string sizing in numpy
elif isinstance(val,basestring):
_type = object
# catchall
else:
_type = object
return _type

def _getRealizationFromCollectorByIndex(self,index):
"""
Obtains a realization from the collector storage using the provided index.
@@ -1200,6 +1262,18 @@ def _selectiveRealization(self,rlz,checkLengthBeforeTruncating=False):
self.raiseAnError(NotImplementedError,'Variable "{}" has no dimensions but has multiple values! Not implemented for DataSet yet.'.format(var))
return rlz

def _setDataTypes(self,rlz):
"""
Set the data types according to the given realization.
@ In, rlz, dict, standardized and formatted realization
@ Out, None
"""
if self.types is None:
self.types = [None]*len(self._allvars)
for v,name in enumerate(self._allvars):
var = rlz[name]
self.types[v] = self._getCompatibleType(var)

def _setScalingFactors(self,var=None):
"""
Sets the scaling factors for the data (mean, scale).
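Taken together, the XDataSet.py changes above wire typing through three steps: _getCompatibleType picks a storage type from a sample value (strings fall back to object), _setDataTypes records one type per variable from the first realization, and _convertToXrDataset casts scalar columns back to that recorded type when the collector is collapsed into an xarray dataset. A loose, standalone sketch of that flow (assumed names, not RAVEN code):

```python
import numpy as np

def compatible_type(val):
    # Loosely mirrors _getCompatibleType: bool/int/float keep a real type,
    # strings fall back to object so numpy never fixes their width.
    if isinstance(val, (bool, np.bool_)):
        return bool
    if isinstance(val, (int, np.integer)):
        return int
    if isinstance(val, (float, np.floating)):
        return float
    return object

# The first realization determines the per-variable types (cf. _setDataTypes).
first_rlz = {'fl': np.array([1.0]), 'in': np.array([2]), 'st': np.array(['msg'])}
types = {var: compatible_type(vals[0]) for var, vals in first_rlz.items()}

# When collapsing collected samples (cf. _convertToXrDataset), a scalar column
# gathered as dtype=object is cast back to its recorded type.
column = np.array([1.0, 10.0], dtype=object)
typed = np.array(column, dtype=types['fl'])
print(types['fl'], typed.dtype)   # <class 'float'> float64
```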