-
Notifications
You must be signed in to change notification settings - Fork 5
/
CHIL2023.bib
351 lines (344 loc) · 51.4 KB
/
CHIL2023.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
@Proceedings{CHIL-2023,
booktitle = {Proceedings of the Conference on Health, Inference, and Learning},
name = {Conference on Health, Inference, and Learning},
shortname = {CHIL},
editor = {Mortazavi, Bobak J. and Sarker, Tasmie and Beam, Andrew and Ho, Joyce C.},
volume = {209},
year = {2023},
start = {2023-06-22},
end = {2023-06-24},
published = {2023-06-13},
conference_url = {https://www.chilconference.org/},
address = {415 Main Street, Cambridge, MA USA 02142},
conference_number = {4},
}
@InProceedings{Mortazavi23,
title = {Conference on Health, Inference, and Learning (CHIL) 2023},
author = {Mortazavi, Bobak J. and Sarker, Tasmie and Beam, Andrew and Ho, Joyce C.},
pages = {1-5},
abstract = {},
}
%1
@InProceedings{Ali23,
title = {Virus2Vec: Viral Sequence Classification Using Machine Learning},
author = {Ali, Sarwan and Bello, Babatunde and Chourasia, Prakash and Punathil, Ria Thazhe and Chen, Pin-Yu and Khan, Imdad Ullah and Patterson, Murray},
pages = {6-18},
abstract = {Understanding the host-specificity of different families of viruses
sheds light on the origin of, e.g., SARS-CoV-2, rabies, and other such
zoonotic pathogens in humans. It enables epidemiologists, medical
professionals, and policymakers to curb existing epidemics and prevent
future ones promptly. In the family Coronaviridae (of which SARS-CoV-2
is a member), it is well-known that the spike protein is the point of
contact between the virus and the host cell membrane. On the other
hand, the two traditional mammalian orders, Carnivora (carnivores) and
Chiroptera (bats) are recognized to be responsible for maintaining and
spreading the Rabies Lyssavirus (RABV). We propose Virus2Vec, a
feature-vector representation for viral (nucleotide or amino acid)
sequences that enable vector-space-based machine learning models to
identify viral hosts. Virus2Vec generates numerical feature vectors
for unaligned sequences, allowing us to forego the computationally
expensive sequence alignment step from the pipeline. Virus2Vec
leverages the power of both the \emph{minimizer} and position weight
matrix (PWM) to generate compact feature vectors. Using several
classifiers, we empirically evaluate Virus2Vec on real-world spike
sequences of Coronaviridae and rabies virus sequence data to predict
the host (identifying the reservoirs of infection). Our results
demonstrate that Virus2Vec outperforms the predictive accuracies of
baseline and state-of-the-art methods.},
}
%2
@InProceedings{Liu23,
title = {Adaptive Weighted Multi-View Clustering},
author = {Liu, Shuo Shuo and Lin, Lin},
pages = {19-36},
abstract = {Learning multi-view data is an emerging problem in machine learning research, and nonnegative matrix factorization (NMF) is a popular dimensionality-reduction method for integrating information from multiple views. These views often provide not only consensus but also complementary information.
However, most multi-view NMF algorithms assign equal weight to each view or tune the weight via line search empirically, which can be infeasible without any prior knowledge of the views or computationally expensive.
In this paper, we propose a weighted multi-view NMF (WM-NMF) algorithm. In particular, we aim to address the critical technical gap, which is to learn both view-specific weight and observation-specific reconstruction weight to quantify each view's information content. The introduced weighting scheme can alleviate unnecessary views' adverse effects and enlarge the positive effects of the important views by assigning smaller and larger weights, respectively. Experimental results confirm the effectiveness and advantages of the proposed algorithm in terms of achieving better clustering performance and dealing with the noisy data compared to the existing algorithms.},
}
%3
@InProceedings{Yoshida23,
title = {Bayesian Active Questionnaire Design for Cause-of-Death Assignment Using Verbal Autopsies},
author = {Yoshida, Toshiya and Fan, Trinity Shuxian and McCormick, Tyler and Wu, Zhenke and Li, Zehang Richard},
pages = {37-49},
abstract = {Only about one-third of the deaths worldwide are assigned a medically-certified cause, and understanding the causes of deaths occurring outside of medical facilities is logistically and financially challenging. Verbal autopsy (VA) is a routinely used tool to collect information on cause of death in such settings. VA is a survey-based method where a structured questionnaire is conducted to family members or caregivers of a recently deceased person, and the collected information is used to infer the cause of death. As VA becomes an increasingly routine tool for cause-of-death data collection, the lengthy questionnaire has become a major challenge to the implementation and scale-up of VA interviews as they are costly and time-consuming to conduct. In this paper, we propose a novel active questionnaire design approach that optimizes the order of the questions dynamically to achieve accurate cause-of-death assignment with the smallest number of questions. We propose a fully Bayesian strategy for adaptive question selection that is compatible with any existing probabilistic cause-of-death assignment methods. We also develop an early stopping criterion that fully accounts for the uncertainty in the model parameters. We also propose a penalized score to account for constraints and preferences of existing question structures. We evaluate the performance of our active designs using both synthetic and real data, demonstrating that the proposed strategy achieves accurate cause-of-death assignment using considerably fewer questions than the traditional static VA survey instruments.},
}
%4
@InProceedings{Tang23,
title = {Modeling Multivariate Biosignals With Graph Neural Networks and Structured State Space Models},
author = {Tang, Siyi and Dunnmon, Jared A and Qu, Liangqiong and Saab, Khaled K and Baykaner, Tina and Lee-Messer, Christopher and Rubin, Daniel L},
pages = {50-71},
abstract = {Multivariate biosignals are prevalent in many medical domains, such as electroencephalography, polysomnography, and electrocardiography. Modeling spatiotemporal dependencies in multivariate biosignals is challenging due to (1) long-range temporal dependencies and (2) complex spatial correlations between the electrodes. To address these challenges, we propose representing multivariate biosignals as time-dependent graphs and introduce \textsc{GraphS4mer}, a general graph neural network (GNN) architecture that improves performance on biosignal classification tasks by modeling spatiotemporal dependencies in biosignals. Specifically, (1) we leverage the Structured State Space architecture, a state-of-the-art deep sequence model, to capture long-range temporal dependencies in biosignals and (2) we propose a graph structure learning layer in \textsc{GraphS4mer} to learn dynamically evolving graph structures in the data. We evaluate our proposed model on three distinct biosignal classification tasks and show that \textsc{GraphS4mer} consistently improves over existing models, including (1) seizure detection from electroencephalographic signals, outperforming a previous GNN with self-supervised pre-training by 3.1 points in AUROC; (2) sleep staging from polysomnographic signals, a 4.1 points improvement in macro-F1 score compared to existing sleep staging models; and (3) 12-lead electrocardiogram classification, outperforming previous state-of-the-art models by 2.7 points in macro-F1 score.},
}
%5
@InProceedings{Wu23a,
title = {Token Imbalance Adaptation for Radiology Report Generation},
author = {Wu, Yuexin and Huang, I-Chan and Huang, Xiaolei},
pages = {72-85},
abstract = {Imbalanced token distributions naturally exist in text documents, leading neural language models to overfit on frequent tokens.
The token imbalance may dampen the robustness of radiology report generators, as complex medical terms appear less frequently but reflect more medical information. In this study, we demonstrate how current state-of-the-art models fail to generate infrequent tokens on two standard benchmark datasets (IU X-RAY and MIMIC-CXR) of radiology report generation. To solve the challenge, we propose the \textbf{T}oken \textbf{Im}balance Adapt\textbf{er} (\textit{TIMER}), aiming to improve generation robustness on infrequent tokens. The model automatically leverages token imbalance by an unlikelihood loss and dynamically optimizes generation processes to augment infrequent tokens. We compare our approach with multiple state-of-the-art methods on the two benchmarks. Experiments demonstrate the effectiveness of our approach in enhancing model robustness overall and infrequent tokens. Our ablation analysis shows that our reinforcement learning method has a major effect in adapting token imbalance for radiology report generation.
},
}
%6
@InProceedings{Chen23a,
title = {Missing Values and Imputation in Healthcare Data: Can Interpretable Machine Learning Help?},
author = {Chen, Zhi and Tan, Sarah and Chajewska, Urszula and Rudin, Cynthia and Caruana, Rich},
pages = {86-99},
abstract = {Missing values are a fundamental problem in data science. Many datasets have missing values that must be properly handled because the way missing values are treated can have large impact on the resulting machine learning model. In medical applications, the consequences may affect healthcare decisions. There are many methods in the literature for dealing with missing values, including state-of-the-art methods which often depend on black-box models for imputation. In this work, we show how recent advances in interpretable machine learning provide a new perspective for understanding and tackling the missing value problem. We propose methods based on high-accuracy glass-box Explainable Boosting Machines (EBMs) that can help users (1) gain new insights on missingness mechanisms and better understand the causes of missingness, and (2) detect -- or even alleviate -- potential risks introduced by imputation algorithms. Experiments on real-world medical datasets illustrate the effectiveness of the proposed methods.},
}
%7
@inproceedings{Kim23,
  title    = {Revisiting Machine-Learning based Drug Repurposing: Drug Indications Are Not a Right Prediction Target},
  author   = {Kim, Siun and Won, Jung-Hyun and Lee, David Seung U and Luo, Renqian and Wu, Lijun and Xia, Yingce and Qin, Tao and Lee, Howard},
  pages    = {100-116},
  abstract = {In this paper, we challenge the utility of approved drug indications as a prediction target for machine learning in drug repurposing (DR) studies. Our research highlights two major limitations of this approach: 1) the presence of strong confounding between drug indications and drug characteristics data, which results in shortcut learning, and 2) inappropriate normalization of indications in existing drug-disease association (DDA) datasets, which leads to an overestimation of model performance. We show that the collection patterns of drug characteristics data were similar within drugs of the same category and the Anatomical Therapeutic Chemical (ATC) classification of drugs could be predicted by using the data collection patterns. Furthermore, we confirm that the performance of existing DR models is significantly degraded in the realistic evaluation setting we proposed in this study. We provide realistic data split information for two benchmark datasets, Fdataset and deepDR dataset.},
}
%8
@InProceedings{Xu23,
title = {Modeling Multivariate Biosignals With Graph Neural Networks and Structured State Space Models},
internal-note = {NOTE(review): this title is byte-identical to entry Tang23 (pages 50-71), but the abstract below describes medical vision-language pre-training and the RadioGraphy Captions (RGC) dataset -- the title appears to be a copy-paste error; verify the actual title against PMLR volume 209, pages 117-132},
author = {Xu, Li and Liu, Bo and Khan, Ameer Hamza and Fan, Lu and Wu, Xiao-Ming},
pages = {117-132},
abstract = {With the availability of large-scale, comprehensive, and general-purpose vision-language (VL) datasets such as MSCOCO, vision-language pre-training (VLP) has become an active area of research and proven to be effective for various VL tasks such as visual-question answering.
However, studies on VLP in the medical domain have so far been scanty. To provide a comprehensive perspective on VLP for medical VL tasks, we conduct a thorough experimental analysis to study key factors that may affect the performance of VLP with a unified vision-language Transformer.
To allow making sound and quick pre-training decisions, we propose RadioGraphy Captions (RGC), a high-quality, multi-modality radiographic dataset containing 18,434 image-caption pairs collected from an open-access online database MedPix. RGC can be used as a pre-training dataset or a new benchmark for medical report generation and medical image-text retrieval.
By utilizing RGC and other available datasets for pre-training, we develop several key insights that can guide future medical VLP research and new strong baselines for various medical VL tasks.},
}
%9
@InProceedings{Gao23,
title = {{SRDA}: Mobile Sensing based Fluid Overload Detection for End Stage Kidney Disease Patients using Sensor Relation Dual Autoencoder},
author = {Tang, Mingyu and Gao, Jiechao and Dong, Guimin and Yang, Carl and Campbell, Bradford and Bowman, Brendan and Zoellner, Jamie Marie and Abdel-Rahman, Emaad and Boukhechba, Mehdi},
pages = {133-146},
abstract = {Chronic kidney disease (CKD) is a life-threatening and prevalent disease. CKD patients, especially end-stage kidney disease (ESKD) patients on hemodialysis, suffer from kidney failures and are unable to remove excessive fluid, causing fluid overload and multiple morbidities including death. Current solutions for fluid overtake monitoring such as ultrasonography and biomarkers assessment are cumbersome, discontinuous, and can only be performed in the clinic. In this paper, we propose SRDA, a latent graph learning powered fluid overload detection system based on Sensor Relation Dual Autoencoder to detect excessive fluid consumption of EKSD patients based on passively collected bio-behavioral data from smartwatch sensors. Experiments using real-world mobile sensing data indicate that SRDA outperforms the state-of-the-art baselines in both F1 score and recall, and demonstrate the potential of ubiquitous sensing for ESKD fluid intake management.},
}
%10
@inproceedings{Manoel23,
  title    = {Federated Multilingual Models for Medical Transcript Analysis},
  author   = {Manoel, Andrea and Garcia, Mirian del Carmen Hipolito and Baumel, Tal and Su, Shize and Chen, Jialei and Sim, Robert and Miller, Dan and Karmon, Danny and Dimitriadis, Dimitrios},
  pages    = {147-162},
  abstract = {Federated Learning (FL) is a machine learning approach that allows the model trainer to access more data samples by training across multiple decentralized data sources while enforcing data access constraints. Such trained models can achieve significantly higher performance beyond what can be done when trained on a single data source. In a FL setting, none of the training data is ever transmitted to any central location; i.e.\ sensitive data remains local and private. These characteristics make FL perfectly suited for applications in healthcare, where a variety of compliance constraints restrict how data may be handled. Despite these apparent benefits in compliance and privacy, certain scenarios such as heterogeneity of the local data distributions pose significant challenges for FL. Such challenges are even more pronounced in the case of a multilingual setting. This paper presents a FL system for pre-training a large-scale multi-lingual model suitable for fine-tuning on downstream tasks such as medical entity tagging. Our work represents one of the first such production-scale systems, capable of training across multiple highly heterogeneous data providers, and achieving levels of accuracy that could not be otherwise achieved by using central training with public data only. We also show that the global model performance can be further improved by a local training step.},
}
%11
@inproceedings{Hwang23,
  title    = {Towards the Practical Utility of Federated Learning in the Medical Domain},
  author   = {Hwang, Hyeonji and Yang, Seongjun and Kim, Daeyoung and Dua, Radhika and Kim, Jong-Yeup and Yang, Eunho and Choi, Edward},
  pages    = {163-181},
  abstract = {Federated learning (FL) is an active area of research. One of the most suitable areas for adopting FL is the medical domain, where patient privacy must be respected. Previous research, however, does not provide a practical guide to applying FL in the medical domain. We propose empirical benchmarks and experimental settings for three representative medical datasets with different modalities: longitudinal electronic health records, skin cancer images, and electrocardiogram signals. The likely users of FL such as medical institutions and IT companies can take these benchmarks as guides for adopting FL and minimize their trial and error. For each dataset, each client data is from a different source to preserve real-world heterogeneity. We evaluate six FL algorithms designed for addressing data heterogeneity among clients, and a hybrid algorithm combining the strengths of two representative FL algorithms. Based on experiment results from three modalities, we discover that simple FL algorithms tend to outperform more sophisticated ones, while the hybrid algorithm consistently shows good, if not the best performance. We also find that a frequent global model update leads to better performance under a fixed training iteration budget. As the number of participating clients increases, higher cost is incurred due to increased IT administrators and GPUs, but the performance consistently increases. We expect future users will refer to these empirical benchmarks to design the FL experiments in the medical domain considering their clinical tasks and obtain stronger performance with lower costs.},
}
%12
@InProceedings{Cina23,
title = {Semantic match: Debugging feature attribution methods in XAI for healthcare},
author = {Cina, Giovanni and Rober, Tabea E and Goedhart, Rob and Birbil, S Ilker},
pages = {182-190},
abstract = {The recent spike in certified Artificial Intelligence tools for healthcare has renewed the debate around adoption of this technology. One thread of such debate concerns Explainable AI and its promise to render AI devices more transparent and trustworthy. A few voices active in the medical AI space have expressed concerns on the reliability of Explainable AI techniques and especially feature attribution methods, questioning their use and inclusion in guidelines and standards. We characterize the problem as a lack of semantic match between explanations and human understanding. To understand when feature importance can be used reliably, we introduce a distinction between feature importance of low- and high-level features. We argue that for data types where low-level features come endowed with a clear semantics, such as tabular data like Electronic Health Records, semantic match can be obtained, and thus feature attribution methods can still be employed in a meaningful and useful way. For high-level features, we sketch a procedure to test whether semantic match has been achieved.},
}
%13
@InProceedings{Merrill23a,
title = {Self-Supervised Pretraining and Transfer Learning Enable Flu and COVID-19 Predictions in Small Mobile Sensing Datasets},
author = {Merrill, Mike A and Althoff, Tim},
pages = {191-206},
abstract = {Detailed mobile sensing data from phones and fitness trackers offer an opportunity to quantify previously unmeasurable behavioral changes to improve individual health and accelerate responses to emerging diseases. Unlike in natural language processing and computer vision, deep learning has yet to broadly impact this domain, in which the majority of research and clinical applications still rely on manually defined features or even forgo predictive modeling altogether due to insufficient accuracy. This is due to unique challenges in the behavioral health domain, including
very small datasets ($\sim \!\! 10^1$ participants), which frequently contain missing data, consist of long time series with critical long-range dependencies (length $>10^4$), and extreme class imbalances ($>10^3$:1). Here, we describe a neural architecture for multivariate time series classification designed to address these unique domain challenges. Our proposed behavioral representation learning approach combines novel tasks for self-supervised pretraining and transfer learning to address data scarcity, and captures long-range dependencies across long-history time series through transformer self-attention following convolutional neural network-based dimensionality reduction. We propose an evaluation framework aimed at reflecting expected real-world performance in plausible deployment scenarios. Concretely, we demonstrate (1) performance improvements over baselines of up to 0.15 ROC AUC across five influenza-related prediction tasks, (2) transfer learning-induced performance improvements including a 16\% relative increase in PR AUC in small data scenarios, and (3) the potential of transfer learning in novel disease scenarios through an exploratory case study of zero-shot COVID-19 prediction in an independent data set. Finally, we discuss potential implications for medical surveillance testing.},
}
%14
@inproceedings{Merrill23b,
  title    = {Homekit2020: A Benchmark for Time Series Classification on a Large Mobile Sensing Dataset with Laboratory Tested Ground Truth of Influenza Infections},
  author   = {Merrill, Mike A and Safranchik, Esteban and Kolbeinsson, Arinbj\"orn and Gade, Piyusha and Ramirez, Ernesto and Schmidt, Ludwig and Foshchini, Luca and Althoff, Tim},
  pages    = {207-228},
  abstract = {Despite increased interest in wearables as tools for detecting various health conditions, there are not as of yet any large public benchmarks for such mobile sensing data. The few datasets that \textit{are} available do not contain data from more than dozens of individuals, do not contain high-resolution raw data or do not include dataloaders for easy integration into machine learning pipelines. Here, we present Homekit2020: the first large-scale public benchmark for time series classification of wearable sensor data. Our dataset contains over 14 million hours of minute-level multimodal Fitbit data, symptom reports, and ground-truth laboratory PCR influenza test results, along with an evaluation framework that mimics realistic model deployments and efficiently characterizes statistical uncertainty in model selection in the presence of extreme class imbalance. Furthermore, we implement and evaluate nine neural and non-neural time series classification models on our benchmark across 450 total training runs in order to establish state of the art performance.},
}
%15
@inproceedings{Wu23b,
  title    = {Collecting data when missingness is unknown: a method for improving model performance given under-reporting in patient populations},
  author   = {Wu, Kevin and Dahlem, Dominik and Hane, Christopher and Halperin, Eran and Zou, James},
  pages    = {229-242},
  abstract = {Machine learning models for healthcare commonly use binary indicator variables to represent the diagnosis of specific health conditions in medical records. However, in populations with significant under-reporting, the absence of a recorded diagnosis does not rule out the presence of a condition, making it difficult to distinguish between negative and missing values. This effect, which we refer to as latent missingness, may lead to model degradation and perpetuate existing biases in healthcare. To address this issue, we propose that healthcare providers and payers allocate a budget towards data collection (eg. subsidies for check-ups or lab tests). However, given finite resources, only a subset of data points can be collected. Additionally, most models are unable to be re-trained after deployment. In this paper, we propose a method for efficient data collection in order to maximize a fixed model's performance on a given population. Through simulated and real-world data, we demonstrate the potential value of targeted data collection to address model degradation.},
}
%16
@inproceedings{Ji23,
  title    = {Large-Scale Study of Temporal Shift in Health Insurance Claims},
  author   = {Ji, Christina X and Alaa, Ahmed M and Sontag, David},
  pages    = {243-278},
  abstract = {Most machine learning models for predicting clinical outcomes are developed using historical data. Yet, even if these models are deployed in the near future, dataset shift over time may result in less than ideal performance. To capture this phenomenon, we consider a task---that is, an outcome to be predicted at a particular time point---to be non-stationary if a historical model is no longer optimal for predicting that outcome. We build an algorithm to test for temporal shift either at the population level or within a discovered sub-population. Then, we construct a meta-algorithm to perform a retrospective scan for temporal shift on a large collection of tasks. Our algorithms enable us to perform the first comprehensive evaluation of temporal shift in healthcare to our knowledge. We create 1,010 tasks by evaluating 242 healthcare outcomes for temporal shift from 2015 to 2020 on a health insurance claims dataset. 9.7\% of the tasks show temporal shifts at the population level, and 93.0\% have some sub-population affected by shifts. We dive into case studies to understand the clinical implications. Our analysis highlights the widespread prevalence of temporal shifts in healthcare.},
}
%17
@inproceedings{Pillai23,
  title    = {Rare Life Event Detection via Mobile Sensing Using Multi-Task Learning},
  author   = {Pillai, Arvind and Nepal, Subigya and Campbell, Andrew},
  pages    = {279-293},
  abstract = {Rare life events significantly impact mental health, and their detection in behavioral studies is a crucial step towards health-based interventions. We envision that mobile sensing data can be used to detect these anomalies. However, the human-centered nature of the problem, combined with the infrequency and uniqueness of these events makes it challenging for unsupervised machine learning methods. In this paper, we first investigate granger-causality between life events and human behavior using sensing data. Next, we propose a multi-task framework with an unsupervised autoencoder to capture irregular behavior, and an auxiliary sequence predictor that identifies transitions in workplace performance to contextualize events. We perform experiments using data from a mobile sensing study comprising N=126 information workers from multiple industries, spanning 10106 days with 198 rare events ($<2\%$). Through personalized inference, we detect the exact day of a rare event with an F1 of 0.34, demonstrating that our method outperforms several baselines. Finally, we discuss the implications of our work from the context of real-world deployment.},
}
%18
@inproceedings{Cho23,
  title    = {Rediscovery of CNN's Versatility for Text-based Encoding of Raw Electronic Health Records},
  author   = {Lee, Minjae and Hur, Kyunghoon and Kim, Jiyoun and Yoon, Jinsung and Choi, Edward},
  pages    = {294-313},
  abstract = {Making the most use of abundant information in electronic health records (EHR) is rapidly becoming an important topic in the medical domain.
Recent work presented a promising framework that embeds entire features in raw EHR data regardless of its form and medical code standards.
The framework, however, only focuses on encoding EHR with minimal preprocessing and fails to consider how to learn efficient EHR representation in terms of computation and memory usage.
In this paper, we search for a versatile encoder not only reducing the large data into a manageable size but also well preserving the core information of patients to perform diverse clinical tasks.
We found that hierarchically structured Convolutional Neural Network (CNN) often outperforms the state-of-the-art model on diverse tasks such as reconstruction, prediction, and generation, even with fewer parameters and less training time.
Moreover, it turns out that making use of the inherent hierarchy of EHR data can boost the performance of any kind of backbone models and clinical tasks performed.
Through extensive experiments, we present concrete evidence to generalize our research findings into real-world practice.
We give a clear guideline on building the encoder based on the research findings captured while exploring numerous settings.},
}
%19
@inproceedings{Jin23,
  title    = {Clinical Relevance Score for Guided Trauma Injury Pattern Discovery with Weakly Supervised $\beta$-VAE},
  author   = {Jin, Qixuan and Oosterhoff, Jacobien HF and Huang, Yepeng and Ghassemi, Marzyeh and Brat, Gabriel A},
  pages    = {314-339},
  abstract = {Given the complexity of trauma presentations, particularly in those involving multiple areas of the body, overlooked injuries are common during the initial assessment by a clinician.
We are motivated to develop an automated trauma pattern discovery framework for comprehensive identification of injury patterns which may eventually support diagnostic decision-making.
We analyze 1,162,399 patients from the Trauma Quality Improvement Program with a disentangled variational autoencoder, weakly supervised by a latent-space classifier of auxiliary features. We also develop a novel scoring metric that serves as a proxy for clinical intuition in extracting clusters with clinically meaningful injury patterns. We validate the extracted clusters with clinical experts, and explore the patient characteristics of selected groupings.
Our metric is able to perform model selection and effectively filter clusters for clinically-validated relevance. },
}
%20
@InProceedings{Hardy23,
title = {Who Controlled the Evidence? Question Answering for Disclosure Information Extraction},
author = {Hardy, Hardy and Ruths, Derek and King, Nicholas B.},
pages = {340-349},
abstract = {Conflict of interest (COI) disclosure statements provide rich information to support transparency and reduce bias in research. We introduce a novel task to identify relationships between sponsoring entities and the research studies they sponsor from the disclosure statement. This task is challenging due to the complexity of recognizing all potential relationship patterns and the hierarchical nature of identifying entities first and then extracting their relationships to the study. To overcome these challenges, in this paper, we also constructed a new annotated dataset and proposed a Question Answering-based method to recognize entities and extract relationships. Our method has demonstrated robustness in handling diverse relationship patterns, and it remains effective even when trained on a low-resource dataset.},
}
%21
@InProceedings{Lacava23,
title = {Fair admission risk prediction with proportional multicalibration},
author = {La Cava, William G and Lett, Elle and Wan, Guangya},
pages = {350-378},
abstract = {Fair calibration is a widely desirable fairness criteria in risk prediction contexts.
One way to measure and achieve fair calibration is with multicalibration.
Multicalibration constrains calibration error among flexibly-defined subpopulations while maintaining overall calibration.
However, multicalibrated models can exhibit a higher percent calibration error among groups with lower base rates than groups with higher base rates.
As a result, it is possible for a decision-maker to learn to trust or distrust model predictions for specific groups.
To alleviate this, we propose \emph{proportional multicalibration}, a criteria that constrains the percent calibration error among groups and within prediction bins.
We prove that satisfying proportional multicalibration bounds a model's multicalibration as well its \emph{differential calibration}, a fairness criteria that directly measures how closely a model approximates sufficiency.
Therefore, proportionally calibrated models limit the ability of decision makers to distinguish between model performance on different patient groups, which may make the models more trustworthy in practice.
We provide an efficient algorithm for post-processing risk prediction models for proportional multicalibration and evaluate it empirically.
We conduct simulation studies and investigate a real-world application of PMC-postprocessing to prediction of emergency department patient admissions.
We observe that proportional multicalibration is a promising criteria for controlling simultaneous measures of calibration fairness of a model over intersectional groups with virtually no cost in terms of classification performance. }
}
%22
@InProceedings{Jeanselme23,
title = {Neural Fine-Gray: Monotonic neural networks for competing risks},
author = {Jeanselme, Vincent and Yoon, Chang Ho and Tom, Brian and Barrett, Jessica},
pages = {379-392},
abstract = {Time-to-event modelling, known as survival analysis, differs from standard regression as it addresses \emph{censoring} in patients who do not experience the event of interest. Despite competitive performances in tackling this problem, machine learning methods often ignore other \emph{competing risks} that preclude the event of interest. This practice biases the survival estimation. Extensions to address this challenge often rely on parametric assumptions or numerical estimations leading to sub-optimal survival approximations. This paper leverages constrained monotonic neural networks to model each competing survival distribution. This modelling choice ensures the exact likelihood maximisation at a reduced computational cost by using automatic differentiation. The effectiveness of the solution is demonstrated on one synthetic and three medical datasets. Finally, we discuss the implications of considering competing risks when developing risk scores for medical practice.},
}
%23
@InProceedings{Rubin-falcone23,
title = {Denoising Autoencoders for Learning from Noisy Patient-Reported Data},
author = {Rubin-Falcone, Harry and Lee, Joyce M. and Wiens, Jenna},
pages = {393-409},
abstract = {Healthcare datasets often include patient-reported values, such as mood, symptoms, and meals, which can be subject to varying levels of human error. Improving the accuracy of patient-reported data could help in several downstream tasks, such as remote patient monitoring. In this study, we propose a novel denoising autoencoder (DAE) approach to denoise patient-reported data, drawing inspiration from recent work in computer vision. Our approach is based on the observation that noisy patient-reported data are often collected alongside higher fidelity data collected from wearable sensors. We leverage these auxiliary data to improve the accuracy of the patient-reported data. Our approach combines key ideas from DAEs with co-teaching to iteratively filter and learn from clean patient-reported samples. Applied to the task of recovering carbohydrate values for blood glucose management in diabetes, our approach reduces noise (MSE) in patient-reported carbohydrates from 72$g^2$ (95\% CI: 54-93) to 18$g^2$ (13-25), outperforming the best baseline (33$g^2$ (27-43)). Notably, our approach achieves strong performance with only access to patient-reported target values, making it applicable to many settings where ground truth data may be unavailable. },
}
%24
@InProceedings{Matton23,
title = {Contrastive Learning of Electrodermal Activity Representations for Stress Detection},
author = {Matton, Katie and Lewis, Robert and Guttag, John and Picard, Rosalind},
pages = {410-426},
abstract = {Electrodermal activity (EDA) is a biosignal that contains valuable information for monitoring health conditions related to sympathetic nervous system activity. Analyzing ambulatory EDA data is challenging because EDA measurements tend to be noisy and sparsely labeled. To address this problem, we present the first study of contrastive learning that examines approaches that are tailored to the EDA signal. We present a novel set of data augmentations that are tailored to EDA, and use them to generate positive examples for unsupervised contrastive learning. We evaluate our proposed approach on the downstream task of stress detection. We find that it outperforms baselines when used both for fine-tuning and for transfer learning, especially in regimes of high label sparsity. We verify that our novel EDA-specific augmentations add considerable value beyond those considered in prior work through a set of ablation experiments.},
}
%25
@InProceedings{Wang23,
title = {Machine Learning for Arterial Blood Pressure Prediction},
author = {Zheng, Jessica and Wang, Hanrui and Chandrasekhar, Anand and Aguirre, Aaron D and Han, Song and Lee, Hae-Seung and Sodini, Charles G},
pages = {427-439},
abstract = {High blood pressure is a major risk factor for cardiovascular disease, necessitating accurate blood pressure (BP) measurement. Clinicians measure BP with an invasive arterial catheter or via a non-invasive arm or finger cuff. However, the former can cause discomfort to the patient and is unsuitable outside Intensive Care Unit (ICU). While cuff-based devices, despite being non-invasive, fails to provide continuous measurement, and they measure from peripheral blood vessels whose BP waveforms differ significantly from those proximal to the heart. Hence, there is an urgent need to develop a measurement protocol for converting easily measured non-invasive data into accurate BP values. Addressing this gap, we propose a non-invasive approach to predict BP from arterial area and blood flow velocity signals measured from a Philips ultrasound transducer (XL-143) applied to large arteries close to heart. We developed the protocol and collected data from 72 subjects. The shape of BP (relative BP) can be theoretically calculated from these waveforms, however there is no established theory to obtain \textit{absolute} BP values. To tackle this challenge, we further employ data-driven machine learning models to predict the Mean Arterial Blood Pressure (MAP), from which the absolute BP can be derived. Our study investigates various machine learning algorithms to optimize the prediction accuracy. We find that LSTM, Transformer, and 1D-CNN algorithms using the blood pressure shape and blood flow velocity waveforms as inputs can achieve 8.6, 8.7, and 8.8 mmHg average standard deviation of the prediction error respectively without anthropometric data such as age, sex, heart rate, height, weight. Furthermore, the 1D-CNN model can achieve 7.9mmHg when anthropometric data is added as inputs, improving upon an anthropometric-only model of 9.5mmHg. 
This machine learning-based approach, capable of converting ultrasound data into MAP values, presents a promising software tool for physicians in clinical decision-making regarding blood pressure management.
},
}
%26
@InProceedings{Chen23b,
title = {A General Framework for Visualizing Embedding Spaces of\titlebreak Neural Survival Analysis Models Based on Angular Information},
author = {Chen, George H},
pages = {440-476},
abstract = {We propose a general framework for visualizing any intermediate embedding representation used by any neural survival analysis model. Our framework is based on so-called \emph{anchor directions} in an embedding space. We show how to estimate these anchor directions using clustering or, alternatively, using user-supplied ``concepts'' defined by collections of raw inputs (e.g., feature vectors all from female patients could encode the concept ``female''). For tabular data, we present visualization strategies that reveal how anchor directions relate to raw clinical features and to survival time distributions. We then show how these visualization ideas extend to handling raw inputs that are images. Our framework is built on looking at angles between vectors in an embedding space, where there could be ``information loss'' by ignoring magnitude information. We show how this loss results in a ``clumping'' artifact that appears in our visualizations, and how to reduce this information loss in practice.},
}
%27
@InProceedings{Tjandra23,
title = {Leveraging an Alignment Set in Tackling Instance-Dependent Label Noise},
author = {Tjandra, Donna and Wiens, Jenna},
pages = {477-497},
abstract = {Noisy training labels can hurt model performance. Most approaches that aim to address label noise assume label noise is independent from the input features. In practice, however, label noise is often feature or \textit{instance-dependent}, and therefore biased (i.e., some instances are more likely to be mislabeled than others). E.g., in clinical care, female patients are more likely to be under-diagnosed for cardiovascular disease compared to male patients. Approaches that ignore this dependence can produce models with poor discriminative performance, and in many healthcare settings, can exacerbate issues around health disparities. In light of these limitations, we propose a two-stage approach to learn in the presence of instance-dependent label noise. Our approach utilizes \textit{anchor points}, a small subset of data for which we know the observed and ground truth labels. On several tasks, our approach leads to consistent improvements over the state-of-the-art in discriminative performance (AUROC) while mitigating bias (area under the equalized odds curve, AUEOC). For example, when predicting acute respiratory failure onset on the MIMIC-III dataset, our approach achieves a harmonic mean (AUROC and AUEOC) of 0.84 (SD [standard deviation] 0.01) while that of the next best baseline is 0.81 (SD 0.01). Overall, our approach improves accuracy while mitigating potential bias compared to existing approaches in the presence of instance-dependent label noise.},
}
%28
@InProceedings{Zhou23,
title = {Evaluating Model Performance in Medical Datasets Over Time},
author = {Zhou, Helen and Chen, Yuwen and Lipton, Zachary},
pages = {498-508},
abstract = {Machine learning (ML) models deployed in healthcare systems
must face data drawn from continually evolving environments.
However,
researchers proposing
such models typically evaluate them
in a time-agnostic manner, splitting datasets
according to patients sampled randomly
throughout the entire study time period.
This work proposes the Evaluation on Medical Datasets Over Time (EMDOT) framework,
which evaluates the performance of a model class across time.
Inspired by the concept of backtesting,
EMDOT simulates possible training procedures that practitioners
might have been able to execute at each point in time
and evaluates the resulting models on all future time points.
Evaluating both linear and more complex models on six distinct medical data sources (tabular and imaging), we
%
show how depending on the dataset,
using all historical data may be ideal in many cases,
whereas using a window of the most recent data could be advantageous in others.
In datasets
%
where models suffer from sudden degradations in performance,
we investigate plausible explanations for these shocks.
We release the EMDOT package to help facilitate
further works in deployment-oriented evaluation over time.},
}
%29
@InProceedings{Deznabi23,
title = {MultiWave: Multiresolution Deep Architectures through Wavelet Decomposition for Multivariate Time Series Prediction},
author = {Deznabi, Iman and Fiterau, Madalina},
pages = {509-525},
abstract = {The analysis of multivariate time series data is challenging due to the various frequencies of signal changes that can occur over both short and long terms. Furthermore, standard deep learning models are often unsuitable for such datasets, as signals are typically sampled at different rates. To address these issues, we introduce MultiWave, a novel framework that enhances deep learning time series models by incorporating components that operate at the intrinsic frequencies of signals. MultiWave uses wavelets to decompose each signal into subsignals of varying frequencies and groups them into frequency bands. Each frequency band is handled by a different component of our model. A gating mechanism combines the output of the components to produce sparse models that use only specific signals at specific frequencies. Our experiments demonstrate that MultiWave accurately identifies informative frequency bands and improves the performance of various deep learning models, including LSTM, Transformer, and CNN-based models, for a wide range of applications. It attains top performance in stress and affect detection from wearables. It also increases the AUC of the best-performing model by 5\% for in-hospital COVID-19 mortality prediction from patient blood samples and for human activity recognition from accelerometer and gyroscope data. We show that MultiWave consistently identifies critical features and their frequency components, thus providing valuable insights into the applications studied.},
}
%30
@InProceedings{Yang23,
title = {{PTGB}: Pre-Train Graph Neural Networks for Brain Network Analysis},
author = {Yang, Yi and Cui, Hejie and Yang, Carl},
pages = {526-544},
abstract = {The human brain is the central hub of the neurobiological system, controlling behavior and cognition in complex ways. Recent advances in neuroscience and neuroimaging analysis have shown a growing interest in the interactions between brain regions of interest (ROIs) and their impact on neural development and disorder diagnosis. As a powerful deep model for analyzing graph-structured data, Graph Neural Networks (GNNs) have been applied for brain network analysis. However, training deep models requires large amounts of labeled data, which is often scarce in brain network datasets due to the complexities of data acquisition and sharing restrictions.
To make the most out of available training data, we propose PTGB, a GNN pre-training framework that captures intrinsic brain network structures, regardless of clinical outcomes, and is easily adaptable to various downstream tasks. PTGB comprises two key components: (1) an unsupervised pre-training technique designed specifically for brain networks, which enables learning from large-scale datasets without task-specific labels; (2) a data-driven parcellation atlas mapping pipeline that facilitates knowledge transfer across datasets with different ROI systems.
Extensive evaluations using various GNN models have demonstrated the robust and superior performance of PTGB compared to baseline methods.},
}
%31
@InProceedings{Vodrahalli23,
title = {Understanding and Predicting the Effect of Environmental Factors on People with Type 2 Diabetes},
author = {Vodrahalli, Kailas and Lyng, Gregory D and Hill, Brian L and Karkkainen, Kimmo and Hertzberg, Jeffrey and Zou, James and Halperin, Eran},
pages = {545-555},
abstract = {Type 2 diabetes mellitus (T2D) affects over 530 million people globally and is often difficult to manage leading to serious health complications. Continuous glucose monitoring (CGM) can help people with T2D to monitor and manage the disease. CGM devices sample an individual's glucose level at frequent intervals enabling sophisticated characterization of an individual's health. In this work, we leverage a large dataset of CGM data (5,447 individuals and 940,663 days of data) paired with health records and activity data to investigate how glucose levels in people with T2D are affected by external factors like weather conditions, extreme weather events, and temporal events including local holidays. We find temperature (p=$2.37\times10^{-8}$, n=3561), holidays (p=$2.23\times10^{-46}$, n=4079), and weekends (p=$7.64\times10^{-124}$, n=5429) each have a significant effect on standard glycemic metrics at a population level. Moreover, we show that we can predict whether an individual will be significantly affected by a (potentially unobserved) external event using only demographic information and a few days of CGM and activity data. Using random forest classifiers, we can predict whether an individual will be more negatively affected than a typical individual with T2D by a given external factor with respect to a given glycemic metric. We find performance (measured as ROC-AUC) is consistently above chance (across classifiers, median ROC-AUC=0.63). Performance is highest for classifiers predicting the effect of time-in-range (median ROC-AUC=0.70). These are important findings because they may enable better patient care management with day-to-day risk assessments based on external factors as well as improve algorithm development by reducing train- and test-time bias due to external factors.},
}
%32
@InProceedings{Nagesh23,
title = {Explaining a machine learning decision to physicians via counterfactuals},
author = {Nagesh, Supriya and Mishra, Nina and Naamad, Yonatan and Rehg, James M and Shah, Mehul A and Wagner, Alexei},
pages = {556-577},
abstract = {Machine learning models perform well on several healthcare tasks and can help reduce the burden on the healthcare system. However, the lack of explainability is a major roadblock to their adoption in hospitals.
\textit{How can the decision of an ML model be explained to a physician?}
The explanations considered in this paper are counterfactuals (CFs), hypothetical scenarios that would have resulted in the opposite outcome. Specifically, time-series CFs are investigated, inspired by the way physicians converse and reason out decisions `I would have given the patient a vasopressor if their blood pressure was lower and falling'.
Key properties of CFs that are particularly meaningful in clinical settings are outlined: physiological plausibility, relevance to the task and sparse perturbations.
Past work on CF generation does not satisfy these properties, specifically plausibility in that realistic time-series CFs are not generated.
A variational autoencoder (VAE)-based approach is proposed that captures these desired properties.
The method produces CFs that improve on prior approaches quantitatively (more plausible CFs as evaluated by their likelihood w.r.t original data distribution, and 100$\times$ faster at generating CFs)
and qualitatively (2$\times$ more plausible and relevant)
as evaluated by three physicians.},
}
%33
@InProceedings{Lehman23,
title = {Do We Still Need Clinical Language Models?},
author = {Lehman, Eric and Hernandez, Evan and Mahajan, Diwakar and Wulff, Jonas and Smith, Micah J and Ziegler, Zachary and Nadler, Daniel and Szolovits, Peter and Johnson, Alistair and Alsentzer, Emily},
pages = {578-597},
abstract = {Although recent advances in scaling large language models (LLMs) have resulted in improvements on many NLP tasks, it remains unclear whether these models trained primarily with general web text are the right tool in highly specialized, safety critical domains such as \emph{clinical text}.
Recent results have suggested that LLMs encode a surprising amount of medical knowledge.
This raises an important question regarding the utility of smaller domain-specific language models.
With the success of general-domain LLMs, is there still a need for specialized clinical models?
To investigate this question, we conduct an extensive empirical analysis of 12 language models, ranging from 220M to 175B parameters, measuring their performance on 3 different clinical tasks that test their ability to parse and reason over electronic health records.
As part of our experiments, we train T5-Base and T5-Large models from scratch on clinical notes from MIMIC III and IV to directly investigate the efficiency of clinical tokens.
We show that relatively small specialized clinical models substantially outperform all in-context learning approaches, even when finetuned on limited annotated data.
Further, we find that pretraining on clinical tokens allows for smaller, more parameter-efficient models that either match or outperform much larger language models trained on general text.
We release the code and the models used under the PhysioNet Credentialed Health Data license and data use agreement.\footnote{\href{https://github.com/elehman16/clinical_llm}{https://github.com/elehman16/clinical_llm}}},
}