# params.yaml

# Settings for the text-extraction stage.
extract_text:
  # Name of the S3 bucket to read the original documents from.
  # NOTE(review): presumably these are the source documents that text is
  # extracted from -- confirm against the extraction code.
  orig_document_s3_bucket: cornell-mfd64
# Settings for the dataset-generation stage.
generate_dataset:
  # Random seed. NOTE(review): presumably used to make the train/test split
  # reproducible -- confirm against the dataset-generation code.
  seed: 42
  # The fraction of the dataset that will be used for the test split. The
  # remainder will be used for the training split.
  test_split_frac: 0.3

# Whether or not to publish the resulting datasets to the Hugging Face Hub. If
# false, the datasets will only be saved locally.
publish_datasets: true

# Settings for the evaluation stage.
eval:
  # The number of rows of ground truth to use for generating results. Lower this
  # to reduce time and cost of evaluation. Rows are consumed in order, so a
  # value of 10 implies that the first 10 rows of the ground truth are consumed.
  num-eval-rows: 10
  # The terms to run evaluation against. See `thesaurus.json` for a list of
  # valid terms.
  terms:
  - min_lot_size
  - min_unit_size
  - max_height
  - max_lot_coverage
  - max_lot_coverage_pavement
  - min_parking_spaces
  # The search method used to locate candidate pages before extraction.
  # NOTE(review): the list of valid search methods is not documented here --
  # confirm against the search implementation.
  search-method: elasticsearch
  # The extraction method to use on located pages. See `extract.py` for a list
  # of valid extraction methods.
  extraction-method: map
  # The number of results to consider when evaluating extraction accuracy and
  # page search recall. e.g., k=6 implies that an extraction is correct if any
  # of the top 6 results contains the correct answer.
  k: 12