generated from gpauloski/python-template
-
Notifications
You must be signed in to change notification settings - Fork 2
/
nougat_test.yaml
47 lines (42 loc) · 1.91 KB
/
nougat_test.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Directory holding the input PDFs to convert
pdf_dir: /lus/eagle/projects/RL-fold/psetty/projects/AuroraGPT/pubmed

# Directory where the workflow writes its output
out_dir: /lus/eagle/projects/candle_aesp/ogokdemir/runs/nougat_test

# Testing limit: convert only the first 100 PDFs
num_conversions: 100

# How many PDFs each parsl task parses
chunk_size: 5
# Settings for the PDF parser. Every option below must be indented under
# parser_settings; at column 0 they would become top-level keys and
# parser_settings itself would load as null.
parser_settings:
  # The name of the parser to use
  name: "nougat"
  # Recommended batch size of pages (not pdfs) is 10, maximum that fits into A100 40GB.
  batchsize: 10
  # Path to download the checkpoint to. If it already exists, the existing one is used.
  checkpoint: /home/ogokdemir/nougat_wf/nougat_ckpts/base
  # Set mmd_out to null if you don't want to write mmd files as a byproduct.
  mmd_out: /lus/eagle/projects/candle_aesp/ogokdemir/runs/nougat_test/mmd_outs
  # If set to false, PDFs that already have a parsed mmd file in mmd_out are skipped.
  recompute: false
  # Set to true if you want to use fp32 instead of bfloat16 (false is recommended)
  full_precision: false
  # If set to true, output text will be formatted as a markdown file.
  markdown: true
  # Preempt processing a paper if mode collapse causes repetitions (true is recommended)
  skipping: true
  # Path for the nougat-specific logs for the run.
  nougat_logs_path: /home/ogokdemir/nougat_wf/nougat_logs/
# The compute settings for the workflow. Every option below must be indented
# under compute_settings; at column 0 they would become top-level keys and
# compute_settings itself would load as null.
compute_settings:
  # The name of the compute platform to use
  name: polaris
  # The number of compute nodes to use
  num_nodes: 2
  # Make sure to update the path to your conda environment and HF cache
  worker_init: "module load conda/2023-10-04; conda activate nougat-wf"
  # The scheduler options to use when submitting jobs.
  # Keep this value quoted: an unquoted leading '#' would start a YAML comment.
  scheduler_options: "#PBS -l filesystems=home:eagle:grand"
  # Make sure to change the account to the account you want to charge
  account: FoundEpidem
  # The HPC queue to submit to
  queue: debug-scaling
  # The amount of time to request for your job.
  # Quoted so it always loads as a string: YAML 1.1 parsers can read unquoted
  # digits-and-colons values (e.g. 10:00:00) as base-60 integers.
  walltime: "01:00:00"