# params.yaml

###########################################################
# Environmental config
###########################################################
# these parameters will not actually be tied to any dvc stages
# but still referenced in files, thus when you change these
# it will not trigger dvc
# instead we assume that these are properly defined.

# Replace with your own locations.
# Add as many databases as you want; each can be referred to by name in the
# jackhmmer step. That name parameter is DVC tracked, and it cross-references
# the values here.
# Maps a database name to the location of its sequence file; the name is what
# `msa_creation.jackhmmer.seqdb` refers to.
sequence_databases:
  uniref100: /kfs2/projects/proteinml/datasets/uniref/uniref100.fasta

###########################################################
# High level parameters
###########################################################

# If the modeling you want to do does not require an MSA, you can skip those steps.
# must be true if a protein model requires MSA input
use_msa: true

# Whether to use protein models, supervised models, or both. Note that if both
# are enabled, the protein models are used as input for the supervised model,
# a la Hsu et al. Protein models are in most cases zero-shot or covariation
# methods that do not require training data, but this is not always the case.
# Experimental data can safely be passed to any protein model
# even if the model does not actually use it.
# whether to use protein models for scoring
use_protein_models: true
# note: requires `data/experimental_data.csv` to be not empty
use_supervised: true

# If you want to score based on only specific positions instead of the whole sequences
# you can turn that on here. Note that not all models support this;
# it is also incompatible with training and testing data that are not fixed length.
# If you do, a list of positions can be passed, e.g. [3, 4, 5] (1-indexed),
# or a range as a string, e.g. '3-5' (inclusive).
# false leaves position-specific scoring turned off
position_specific_scoring: false


###########################################################
# Creation of MSA for covariation model training
###########################################################
msa_creation:
  # Selects how the MSA is created:
  #   `starting_sequences` - from `data/starting_sequences.fa` and, optionally,
  #                          sequences from supervised training data
  #   `jackhmmer`          - from a `jackhmmer` search against a database
  msa_mode: jackhmmer

  # Settings for `msa_mode: starting_sequences` (using predefined sequences)
  starting_sequences:
    # if true, the sequences are already aligned in a3m format
    prealigned: true
    # Set to true to add sequences from any supervised training data provided;
    # this requires `data/experimental_data.csv` to be not empty.
    add_training_sequences: false
    # The target columns to use when determining the known actives to add.
    activity_targets:
      - 0
    # The threshold is non-inclusive: a target must be greater than it.
    # If you want a target that is low-better to be considered active, you
    # must reformat the target to be high-better.
    activity_threshold: 0.0

  # Settings for `msa_mode: jackhmmer` (searching natural homologs)
  # Note: requires `data/wt.fa` to be not empty.
  jackhmmer:
    # name of an entry under `sequence_databases`
    seqdb: uniref100
    iterations: 5
    domain_threshold: 0.4
    sequence_threshold: 0.4
    use_bitscores: true
    sequence_identity_filter: 95
    minimum_sequence_coverage: 50
    minimum_column_coverage: 70
    theta: 0.8
    cpus: 16
    mx: 'BLOSUM62'

###########################################################
# Preprocessing of MSA for model training
###########################################################
# Settings applied when preprocessing the MSA for model training.
msaprocessing:
  # NOTE(review): differs from `msa_creation.jackhmmer.theta` (0.8) — confirm
  # that the two values are meant to be independent
  theta: 0.2
  use_weights: true
  preprocess: true
  # presumably fractions of gaps allowed per sequence and per focus column
  # — TODO confirm against the consumer
  threshold_sequence_frac_gaps: 0.5
  threshold_focus_cols_frac_gaps: 0.5
  remove_sequences_with_indeterminate_AA_in_focus_cols: true
  additional_weights: false

###########################################################
# Training and prediction
###########################################################
modeling:
  protein_models:
    # These are models that use protein sequences directly as input,
    # e.g. HMM score, ESM log likelihood.
    # If supervised is used, these will be used as input to the supervised model.
    models: ['HMM']
    # String holding a JSON object; presumably one entry of keyword arguments
    # per name in `models` — verify against the consumer.
    models_kwargs: '{"HMM": {}}'
  supervised:
    type: regression
    model: 'KernelRidge'
    # String holding a JSON object of keyword arguments, presumably for `model`.
    # Keys are double-quoted, like the other *_kwargs strings in this file, so
    # the value is valid JSON as well as valid YAML flow syntax.
    model_kwargs: '{"alpha": 5.0}'
    embeddings: ['ESM']
    # Same format as `models_kwargs`, keyed by embedding name.
    embeddings_kwargs: '{"ESM": {}}'
    run_pca: true
    min_pca_variance_explained: 0.95
    standardize_X: true
    scale_y: true

###########################################################
# If training data given (zero-shot only does not require it)
###########################################################
validation:
  # NOTE(review): `cv` and `split_type` presumably only take effect when
  # `do_cv` is true — confirm against the consumer
  do_cv: false
  cv: 5
  split_type: 'random'

###########################################################
# Prediction of variants
###########################################################
# Requires `data/variants.csv` to be not empty
# must have columns `id`, `sequence`

# Options for the prediction of variants.
prediction_mode:
  # whether the values are compared to the input wild type or not
  compare_to_wt: false