ENH: Enable models for sparsely sampled fMRI series #414

Open · wants to merge 2 commits into base: master
18 changes: 15 additions & 3 deletions bids/analysis/analysis.py
@@ -386,20 +386,32 @@ def get_design_matrix(self, names=None, format='long', mode='both',
kwargs['timing'] = True
kwargs['sparse'] = False

acquisition_time = None
if sampling_rate == 'TR':
trs = {var.run_info[0].tr for var in self.collection.variables.values()}
tas = {var.run_info[0].ta for var in self.collection.variables.values()}
if not trs:
raise ValueError("Repetition time unavailable; specify sampling_rate "
"explicitly")
elif len(trs) > 1:
raise ValueError("Non-unique Repetition times found ({!r}); specify "
"sampling_rate explicitly")
sampling_rate = 1. / trs.pop()
"sampling_rate explicitly".format(trs))
TR = trs.pop()
if not tas:
warnings.warn("Acquisition time unavailable; assuming TA = TR")
tas = {TR}
elif len(tas) > 1:
raise ValueError("Non-unique acquisition times found ({!r})".format(tas))

sampling_rate = 1. / TR
acquisition_time = tas.pop()
elif sampling_rate == 'highest':
sampling_rate = None
dense_df = coll.to_df(names, format='wide',
include_sparse=include_sparse,
sampling_rate=sampling_rate, **kwargs)
sampling_rate=sampling_rate,
integration_window=acquisition_time,
**kwargs)
if dense_df is not None:
dense_df = dense_df.drop(['onset', 'duration'], axis=1)
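To make the sampling_rate='TR' branch concrete (numbers are illustrative): if every run in the collection reports TR = 2.0 s and TA = 1.5 s, the dense design matrix is built at sampling_rate = 0.5 Hz with integration_window = 1.5 s; a missing TA falls back to TR with a warning, and non-unique TRs or TAs raise a ValueError.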

5 changes: 2 additions & 3 deletions bids/analysis/tests/test_transformations.py
@@ -33,9 +33,8 @@ def sparse_run_variable_with_missing_values():
'duration': [1.2, 1.6, 0.8, 2],
'amplitude': [1, 1, np.nan, 1]
})
run_info = [RunInfo({'subject': '01'}, 20, 2, 'dummy.nii.gz')]
var = SparseRunVariable(
name='var', data=data, run_info=run_info, source='events')
run_info = [RunInfo({'subject': '01'}, 20, 2, 2, 'dummy.nii.gz')]
var = SparseRunVariable(name='var', data=data, run_info=run_info, source='events')
return BIDSRunVariableCollection([var])


10 changes: 5 additions & 5 deletions bids/variables/entities.py
@@ -31,28 +31,28 @@ class RunNode(Node):
''' Represents a single Run in a BIDS project.

Args:
id (int): The index of the run.
entities (dict): Dictionary of entities for this Node.
image_file (str): The full path to the corresponding nifti image.
duration (float): Duration of the run, in seconds.
repetition_time (float): TR for the run.
task (str): The task name for this run.
acquisition_time (float): TA for the run.
'''

def __init__(self, entities, image_file, duration, repetition_time):
def __init__(self, entities, image_file, duration, repetition_time, acquisition_time=None):
self.image_file = image_file
self.duration = duration
self.repetition_time = repetition_time
self.acquisition_time = acquisition_time or repetition_time
super(RunNode, self).__init__('run', entities)

def get_info(self):

return RunInfo(self.entities, self.duration, self.repetition_time,
self.image_file)
self.acquisition_time, self.image_file)


# Stores key information for each Run.
RunInfo = namedtuple('RunInfo', ['entities', 'duration', 'tr', 'image'])
RunInfo = namedtuple('RunInfo', ['entities', 'duration', 'tr', 'ta', 'image'])


class NodeIndex(Node):
29 changes: 26 additions & 3 deletions bids/variables/io.py
@@ -85,6 +85,27 @@ def load_variables(layout, types=None, levels=None, skip_empty=True,
return dataset


def _get_timing_info(img_md):
if 'RepetitionTime' in img_md:
tr = img_md['RepetitionTime']
if 'DelayTime' in img_md:
ta = tr - img_md['DelayTime']
elif 'SliceTiming' in img_md:
slicetimes = sorted(img_md['SliceTiming'])
# a, b ... z
# z = final slice onset, b - a = slice duration
ta = np.round(slicetimes[-1] + slicetimes[1] - slicetimes[0], 3)

Collaborator:
Is it worth testing for non-uniform slice onsets (and raise a not supported error)? I don't know if that ever happens in practice, but if it does, we should probably fail...

Collaborator Author:
That feels like a validation problem. While we can put those checks in, ad hoc, I think it would make sense to either fail on load for validation problems or be able to insert a little boilerplate check like: validate(slicetimes, 'SliceTiming')
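For illustration, a minimal sketch of the kind of boilerplate check discussed above; the validate-style helper below is hypothetical and not part of this PR:

import numpy as np

def _check_uniform_slice_onsets(slice_times, tol=1e-3):
    # Hypothetical helper: sorted slice onsets should be evenly spaced;
    # otherwise the "last onset + slice duration" estimate of TA is unreliable.
    diffs = np.diff(sorted(slice_times))
    if diffs.size and np.any(np.abs(diffs - diffs[0]) > tol):
        raise ValueError("Non-uniform slice onsets are not supported: "
                         "{!r}".format(slice_times))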

# If the "silence" is <1ms, consider acquisition continuous
if np.abs(tr - ta) < 1e-3:
ta = tr
else:
ta = tr
elif 'VolumeTiming' in img_md:
return NotImplemented

Collaborator:
Should this be NotImplementedError? I think this will fail as written because of the tuple unpacking when _get_timing_info is called, but the user won't have any idea what went wrong.
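A minimal sketch of what this comment suggests, i.e. raising rather than returning, so the unsupported case fails loudly (not what the PR currently does):

elif 'VolumeTiming' in img_md:
    # Returning NotImplemented would break the `tr, ta = _get_timing_info(img_md)`
    # unpacking at the call site with an opaque error; raising is explicit.
    raise NotImplementedError("Deriving timing from VolumeTiming is not yet supported")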


return tr, ta
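
As a usage illustration of the helper above (metadata values are made up):

_get_timing_info({'RepetitionTime': 2.0, 'DelayTime': 0.5})
# -> (2.0, 1.5): TA = TR - DelayTime

slice_times = [round(i * 0.05, 3) for i in range(30)]  # 30 slices, 50 ms apart
_get_timing_info({'RepetitionTime': 2.0, 'SliceTiming': slice_times})
# -> (2.0, 1.5): last onset (1.45 s) plus one slice duration (0.05 s)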


def _load_time_variables(layout, dataset=None, columns=None, scan_length=None,
drop_na=True, events=True, physio=True, stim=True,
regressors=True, skip_empty=True, scope='all',
@@ -146,8 +167,9 @@ def _load_time_variables(layout, dataset=None, columns=None, scan_length=None,
if 'run' in entities:
entities['run'] = int(entities['run'])

tr = layout.get_metadata(img_f, suffix='bold', scope=scope,
full_search=True)['RepetitionTime']
img_md = layout.get_metadata(img_f, suffix='bold', scope=scope,
full_search=True)
tr, ta = _get_timing_info(img_md)

# Get duration of run: first try to get it directly from the image
# header; if that fails, try to get NumberOfVolumes from the
@@ -167,7 +189,8 @@ def _load_time_variables(layout, dataset=None, columns=None, scan_length=None,
raise ValueError(msg)

run = dataset.get_or_create_node('run', entities, image_file=img_f,
duration=duration, repetition_time=tr)
duration=duration, repetition_time=tr,
acquisition_time=ta)
run_info = run.get_info()

# Process event files
13 changes: 8 additions & 5 deletions bids/variables/kollekshuns.py
@@ -243,7 +243,7 @@ def _all_dense(self):
for v in self.variables.values()])

def resample(self, sampling_rate=None, variables=None, force_dense=False,
in_place=False, kind='linear'):
in_place=False, kind='linear', **kwargs):
''' Resample all dense variables (and optionally, sparse ones) to the
specified sampling rate.

@@ -276,7 +276,8 @@ def resample(self, sampling_rate=None, variables=None, force_dense=False,
# None if in_place; no update needed
_var = var.resample(sampling_rate,
inplace=in_place,
kind=kind)
kind=kind,
**kwargs)
if not in_place:
_variables[name] = _var

@@ -289,6 +290,7 @@

def to_df(self, variables=None, format='wide', sparse=True,
sampling_rate=None, include_sparse=True, include_dense=True,
integration_window=None,
**kwargs):
''' Merge columns into a single pandas DataFrame.

@@ -343,9 +345,10 @@ def to_df(self, variables=None, format='wide', sparse=True,
sampling_rate = sampling_rate or self.sampling_rate

# Make sure all variables have the same sampling rate
variables = list(self.resample(sampling_rate, variables,
force_dense=True,
in_place=False).values())
variables = list(
self.resample(sampling_rate, variables,
force_dense=True, in_place=False,
integration_window=integration_window).values())

return super(BIDSRunVariableCollection, self).to_df(variables, format,
**kwargs)
2 changes: 1 addition & 1 deletion bids/variables/tests/test_variables.py
@@ -19,7 +19,7 @@ def generate_DEV(name='test', sr=20, duration=480):
ent_names = ['task', 'run', 'session', 'subject']
entities = {e: uuid.uuid4().hex for e in ent_names}
image = uuid.uuid4().hex + '.nii.gz'
run_info = RunInfo(entities, duration, 2, image)
run_info = RunInfo(entities, duration, 2, 2, image)
return DenseRunVariable(name='test', values=values, run_info=run_info,
source='dummy', sampling_rate=sr)

82 changes: 73 additions & 9 deletions bids/variables/variables.py
@@ -311,6 +311,51 @@ def get_duration(self):
''' Return the total duration of the Variable's run(s). '''
return sum([r.duration for r in self.run_info])

def get_sampling_rate(self, target_sr, rounding='up'):
''' Choose a sampling rate whose interval evenly divides the acquisition and repetition times

Args:
target_sr (float): Approximate sampling rate to target (in Hz)
rounding (str): Round the sampling interval 'up' (default) or 'down' if
``1 / target_sr`` doesn't evenly divide acquisition and repetition
times

Returns:
sampling_rate (float)

'''
assert rounding in ('up', 'down')

# Convention: lowercase in float seconds/Hz, CAPS in int ms
trs = {ri.tr for ri in self.run_info}
tas = {ri.ta for ri in self.run_info}
assert len(trs) == 1
assert len(tas) == 1

TR = int(np.round(1000. * trs.pop()))
TA = int(np.round(1000. * tas.pop()))
# Nothing to do for continuous acquisition
if TR == TA:
return target_sr

TARGET_DT = int(np.round(1000. / target_sr))

# Interval must evenly divide into TR and TA
DT = gcd(TR, TA)
# Find nearest divisor of DT to TARGET_DT
if DT > TARGET_DT:
div, mod = divmod(DT, TARGET_DT)
if mod == 0:
DT = TARGET_DT
elif rounding == 'up':
DT = DT // div
else:
DT = DT // (div + 1)

# Back to Hz
return 1000. / DT
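
A worked example with made-up numbers, assuming var is a SparseRunVariable whose runs all share TR = 2.0 s and TA = 1.5 s (so the greatest common divisor of the two is 500 ms):

var.get_sampling_rate(10)                  # 100 ms divides 500 ms evenly -> 10.0 Hz
var.get_sampling_rate(3)                   # 333 ms does not; interval rounds up
                                           # to 500 ms -> 2.0 Hz
var.get_sampling_rate(3, rounding='down')  # interval rounds down to 250 ms -> 4.0 Hz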


def to_dense(self, sampling_rate):
''' Convert the current sparse column to a dense representation.
Returns: A DenseRunVariable.
@@ -425,7 +470,7 @@ def _build_entity_index(self, run_info, sampling_rate):
self.timestamps = pd.concat(_timestamps, axis=0, sort=True)
return pd.concat(index, axis=0, sort=True).reset_index(drop=True)

def resample(self, sampling_rate, inplace=False, kind='linear'):
def resample(self, sampling_rate, integration_window=None, inplace=False, kind='linear'):
'''Resample the Variable to the specified sampling rate.

Parameters
Expand All @@ -442,7 +487,7 @@ def resample(self, sampling_rate, inplace=False, kind='linear'):
'''
if not inplace:
var = self.clone()
var.resample(sampling_rate, True, kind)
var.resample(sampling_rate, integration_window, True, kind)
return var

if sampling_rate == self.sampling_rate:
@@ -453,18 +498,36 @@

self.index = self._build_entity_index(self.run_info, sampling_rate)

x = np.arange(n)
num = len(self.index)

Collaborator:
This is on me I think, but we could probably use less confusing names than n and num for the old and new lengths.

Collaborator Author:
Yeah, I can try to rewrite; I was mostly aiming for a minimal diff.


from scipy.interpolate import interp1d
f = interp1d(x, self.values.values.ravel(), kind=kind)
x_new = np.linspace(0, n - 1, num=num)
self.values = pd.DataFrame(f(x_new))
if integration_window is not None:

Collaborator:
Rather than determine which approach to use based on whether integration_window is passed, I think we should probably always use interpolation for upsampling and decimation for downsampling (we should definitely avoid downsampling via interpolation—that was a pretty bad decision on my part). Actually, I'm not sure integration/averaging is a great solution, as it's sort of a half-assed way to do temporal filtering. If we're going to go down this road, maybe we should just temporally filter out frequencies above half the target sampling rate and then decimate.

In general, I wonder if it makes sense to implement the resampling operations ourselves, or if we should just do it via pandas (or something like traces, which wraps pandas). I would expect that resampling in pandas is probably going to be more efficient than rolling our own solution, while relieving us of the burden of extensive testing.
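
For reference, a rough sketch of the filter-then-decimate alternative described above; this is not part of the PR and assumes an integer downsampling factor:

import numpy as np
from scipy.signal import decimate

def downsample_antialiased(values, old_sr, new_sr):
    # Sketch only: decimate applies an anti-aliasing low-pass filter
    # before keeping every `factor`-th sample.
    factor = int(round(old_sr / new_sr))
    if factor <= 1:
        return np.asarray(values, dtype=float)
    return decimate(np.asarray(values, dtype=float), factor,
                    ftype='fir', zero_phase=True)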

from scipy.sparse import lil_matrix
old_times = np.arange(n) / old_sr
new_times = np.arange(num) / sampling_rate
integrator = lil_matrix((num, n), dtype=np.uint8)
count = None
for i, new_time in enumerate(new_times):
cols = (old_times >= new_time) & (old_times < new_time + integration_window)

Collaborator:
Should the integration window be centered on new_time, rather than using it as the lower boundary? Otherwise the value at each downsampled point is basically reading the future, which is probably not what we want.
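A centered window along the lines of this comment might look like the following (sketch; the half-open boundary handling is an assumption):

half = integration_window / 2.
cols = (old_times >= new_time - half) & (old_times < new_time + half)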

# This could be determined analytically, but dodging off-by-one errors
if count is None:
count = np.sum(cols)
integrator[i, cols] = 1

old_vals = self.values.values
self.values = pd.DataFrame(integrator.tocsr().dot(old_vals) / count)
else:
from scipy.interpolate import interp1d
x = np.arange(n)
f = interp1d(x, self.values.values.ravel(), kind=kind)
x_new = np.linspace(0, n - 1, num=num)
self.values = pd.DataFrame(f(x_new))

assert len(self.values) == len(self.index)

self.sampling_rate = sampling_rate
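
To illustrate what the integration branch computes, a small standalone example with made-up numbers (10 Hz original grid, 0.5 Hz target, TA = 1.5 s):

import numpy as np

old_sr, new_sr, ta = 10.0, 0.5, 1.5
old_times = np.arange(40) / old_sr   # 4 s of samples on the original grid
new_times = np.arange(2) / new_sr    # output samples at 0 s and 2 s
mask = (old_times >= new_times[0]) & (old_times < new_times[0] + ta)
print(mask.sum())                    # 15 old samples fall in [0.0, 1.5)
# Each output value is the mean of the old samples inside its acquisition
# window, i.e. a boxcar average over TA rather than point interpolation.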

def to_df(self, condition=True, entities=True, timing=True, sampling_rate=None):
def to_df(self, condition=True, entities=True, timing=True, sampling_rate=None,
integration_window=None):
'''Convert to a DataFrame, with columns for name and entities.

Parameters
Expand All @@ -480,7 +543,8 @@ def to_df(self, condition=True, entities=True, timing=True, sampling_rate=None):
sampled uniformly). If False, omits them.
'''
if sampling_rate not in (None, self.sampling_rate):
return self.resample(sampling_rate).to_df(condition, entities)
return self.resample(sampling_rate,
integration_window=integration_window).to_df(condition, entities)

df = super(DenseRunVariable, self).to_df(condition, entities)
