diff --git a/commons/dataset.py b/commons/dataset.py
index bf646426..14d9112a 100644
--- a/commons/dataset.py
+++ b/commons/dataset.py
@@ -4,6 +4,7 @@
 import bittensor as bt
 from datasets import load_dataset, interleave_datasets
 
+from commons.factory import Factory
 from template.protocol import Completion, RankingRequest
 
 
@@ -116,7 +117,7 @@ def get_batch(cls) -> List[Dict]:
         dataset_names = list(cls._eval_datasets.keys())
         key = random.choice(dataset_names)
         bt.logging.info(f"Using dataset: {key}, for evaluation")
-        batch_size = 32
+        batch_size = Factory.get_config().evaluation.batch_size
         return [next_circular(cls._eval_datasets, key) for _ in range(batch_size)]
 
 
diff --git a/commons/evals.py b/commons/evals.py
index 330e134b..ce63839a 100644
--- a/commons/evals.py
+++ b/commons/evals.py
@@ -28,7 +28,7 @@ async def classification_accuracy(
         model_config: ModelConfig = None,
     ) -> float:
         total_accuracy = 0
-        num_batches = Factory.get_config().eval.num_batches
+        num_batches = Factory.get_config().evaluation.num_batches
         for _ in range(num_batches):
             batch_human_preference = EvalDatasetManager.get_batch()
             if scoring_method == ScoringMethod.HF_MODEL:
diff --git a/template/utils/config.py b/template/utils/config.py
index 2728c34e..25c0436a 100644
--- a/template/utils/config.py
+++ b/template/utils/config.py
@@ -107,12 +107,19 @@ def add_args(parser):
     )
 
     parser.add_argument(
-        "--eval.num_batches",
+        "--evaluation.num_batches",
         type=int,
        help="Number of batches from dataset to use when evaluating.",
         default=10,
     )
 
+    parser.add_argument(
+        "--evaluation.batch_size",
+        type=int,
+        help="Number of rows of data from dataset to use when evaluating.",
+        default=32,
+    )
+
     parser.add_argument(
         "--neuron.sample_size",
         type=int,