-
Notifications
You must be signed in to change notification settings - Fork 2
/
params.yaml
128 lines (113 loc) · 4.9 KB
/
params.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
###########################################################
# Environmental config
###########################################################
# These parameters are not tied to any DVC stage, although they are still
# referenced in files, so changing them will not trigger DVC.
# Instead, we assume that they are properly defined.
# Replace the paths below with your own locations.
# Add as many databases as you want; they can be referred to by name in the
# jackhmmer step. That name parameter is DVC tracked, and it cross-references
# the values here.
sequence_databases:
  # Maps a database name to the FASTA file it points to. The name (the key) is
  # what the jackhmmer `seqdb` parameter refers to.
  uniref100: /kfs2/projects/proteinml/datasets/uniref/uniref100.fasta
###########################################################
# High level parameters
###########################################################
# If the modeling you want to do does not require an MSA, you can skip the
# MSA steps by setting this to false.
# Must be true if a protein model requires MSA input.
use_msa: true
# Whether to use protein models, supervised models, or both. Note that if both
# are enabled, the protein models are used as input for the supervised model,
# a la Hsu et al. Protein models are in most cases zero-shot or covariation
# methods that do not require training data, but this is not always the case.
# Experimental data can safely be passed to any protein model even if the
# model does not actually use it.
# Whether to use protein models for scoring:
use_protein_models: true
# Note: requires `data/experimental_data.csv` to be not empty.
use_supervised: true
# If you want to score based only on specific positions instead of the whole
# sequences, you can turn that on here. Note that not all models support this.
# It is also incompatible with training and testing data that are not fixed
# length. To enable it, pass a list of 1-indexed positions, e.g. [3, 4, 5],
# or an inclusive range as a string, e.g. '3-5'.
position_specific_scoring: false
###########################################################
# Creation of MSA for covariation model training
###########################################################
msa_creation:
  # How the MSA is created:
  #   `starting_sequences`: the MSA is created using
  #     `data/starting_sequences.fa` and, optionally, sequences from the
  #     supervised training data.
  #   `jackhmmer`: the MSA is created using a `jackhmmer` search against a
  #     database.
  msa_mode: jackhmmer
  # Settings for using predefined sequences
  # (msa_mode: starting_sequences).
  starting_sequences:
    # If true, the sequences are already aligned in a3m format.
    prealigned: true
    # Set `add_training_sequences: true` to also add sequences from any
    # supervised training data provided.
    # This requires `data/experimental_data.csv` to be not empty.
    add_training_sequences: false
    # The target columns to use when determining the known actives to add.
    activity_targets:
      - 0
    # The threshold is non-inclusive: a value must be greater than it.
    # If you want a target that is low-better to be considered active, you
    # must reformat the target to be high-better.
    activity_threshold: 0.0
  # Settings for searching natural homologs
  # (msa_mode: jackhmmer).
  # Note: requires `data/wt.fa` to be not empty.
  jackhmmer:
    # Name of an entry under `sequence_databases`.
    seqdb: uniref100
    iterations: 5
    domain_threshold: 0.4
    sequence_threshold: 0.4
    use_bitscores: true
    sequence_identity_filter: 95
    minimum_sequence_coverage: 50
    minimum_column_coverage: 70
    theta: 0.8
    cpus: 16
    mx: 'BLOSUM62'
###########################################################
# Preprocessing of MSA for model training
###########################################################
msaprocessing:
  # Settings applied when preprocessing the MSA for model training.
  # The two `threshold_*_frac_gaps` values are gap fractions.
  # NOTE(review): presumably sequences / focus columns above the threshold are
  # dropped - confirm against the consuming stage.
  theta: 0.2
  use_weights: true
  preprocess: true
  threshold_sequence_frac_gaps: 0.5
  threshold_focus_cols_frac_gaps: 0.5
  remove_sequences_with_indeterminate_AA_in_focus_cols: true
  additional_weights: false
###########################################################
# Training and prediction
###########################################################
modeling:
  protein_models:
    # These are models that use protein sequences directly as input,
    # e.g. HMM score, ESM log likelihood.
    # If supervised is used, these will be used as input to the supervised
    # model.
    models: ['HMM']
    # Keyword arguments per model name, given as a JSON-encoded string.
    models_kwargs: '{"HMM": {}}'
  supervised:
    type: regression
    model: 'KernelRidge'
    # Keyword arguments for `model`, given as a JSON-encoded string like the
    # other `*_kwargs` values in this section. JSON requires double-quoted
    # keys, so write '{"alpha": 5.0}' rather than '{alpha: 5.0}'.
    # NOTE(review): assumes the consuming stage decodes this string as JSON
    # (the quoted form is also valid as a YAML flow mapping) - confirm.
    model_kwargs: '{"alpha": 5.0}'
    embeddings: ['ESM']
    # Keyword arguments per embedding name, given as a JSON-encoded string.
    embeddings_kwargs: '{"ESM": {}}'
    run_pca: true
    min_pca_variance_explained: 0.95
    standardize_X: true
    scale_y: true
###########################################################
# If training data is given (zero-shot only does not require it)
###########################################################
validation:
  # Whether to run cross-validation.
  do_cv: false
  # NOTE(review): presumably the number of cross-validation folds - confirm.
  cv: 5
  # How the data is split for validation.
  split_type: 'random'
###########################################################
# Prediction of variants
###########################################################
# Requires `data/variants.csv` to be not empty.
# It must have the columns `id` and `sequence`.
prediction_mode:
  # Whether the predicted values are compared to the input wild type or not.
  compare_to_wt: false