diff --git a/apps/build_disk_index.cpp b/apps/build_disk_index.cpp index 1edb027da..b617a5f4a 100644 --- a/apps/build_disk_index.cpp +++ b/apps/build_disk_index.cpp @@ -9,6 +9,7 @@ #include "math_utils.h" #include "index.h" #include "partition.h" +#include "program_options_utils.hpp" namespace po = boost::program_options; @@ -21,61 +22,65 @@ int main(int argc, char **argv) bool append_reorder_data = false; bool use_opq = false; - po::options_description desc{"Arguments"}; + po::options_description desc{ + program_options_utils::make_program_description("build_disk_index", "Build a disk-based index.")}; try { desc.add_options()("help,h", "Print information on arguments"); - desc.add_options()("data_type", po::value(&data_type)->required(), "data type "); - desc.add_options()("dist_fn", po::value(&dist_fn)->required(), "distance function "); - desc.add_options()("data_path", po::value(&data_path)->required(), - "Input data file in bin format"); - desc.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), - "Path prefix for saving index file components"); - desc.add_options()("max_degree,R", po::value(&R)->default_value(64), "Maximum graph degree"); - desc.add_options()("Lbuild,L", po::value(&L)->default_value(100), - "Build complexity, higher value results in better graphs"); - desc.add_options()("search_DRAM_budget,B", po::value(&B)->required(), - "DRAM budget in GB for searching the index to set the " - "compressed level for data while search happens"); - desc.add_options()("build_DRAM_budget,M", po::value(&M)->required(), - "DRAM budget in GB for building the index"); - desc.add_options()("num_threads,T", po::value(&num_threads)->default_value(omp_get_num_procs()), - "Number of threads used for building index (defaults to " - "omp_get_num_procs())"); - desc.add_options()("QD", po::value(&QD)->default_value(0), " Quantized Dimension for compression"); - desc.add_options()("codebook_prefix", po::value(&codebook_prefix)->default_value(""), - 
"Path prefix for pre-trained codebook"); - desc.add_options()("PQ_disk_bytes", po::value(&disk_PQ)->default_value(0), - "Number of bytes to which vectors should be compressed " - "on SSD; 0 for no compression"); - desc.add_options()("append_reorder_data", po::bool_switch()->default_value(false), - "Include full precision data in the index. Use only in " - "conjuction with compressed data on SSD."); - desc.add_options()("build_PQ_bytes", po::value(&build_PQ)->default_value(0), - "Number of PQ bytes to build the index; 0 for full " - "precision build"); - desc.add_options()("use_opq", po::bool_switch()->default_value(false), - "Use Optimized Product Quantization (OPQ)."); - desc.add_options()("label_file", po::value(&label_file)->default_value(""), - "Input label file in txt format for Filtered Index build ." - "The file should contain comma separated filters for each node " - "with each line corresponding to a graph node"); - desc.add_options()("universal_label", po::value(&universal_label)->default_value(""), - "Universal label, Use only in conjuction with label file for " - "filtered " - "index build. 
If a graph node has all the labels against it, we " - "can " - "assign a special universal filter to the point instead of comma " - "separated filters for that point"); - desc.add_options()("FilteredLbuild", po::value(&Lf)->default_value(0), - "Build complexity for filtered points, higher value " - "results in better graphs"); - desc.add_options()("filter_threshold,F", po::value(&filter_threshold)->default_value(0), - "Threshold to break up the existing nodes to generate new graph " - "internally where each node has a maximum F labels."); - desc.add_options()("label_type", po::value(&label_type)->default_value("uint"), - "Storage type of Labels , default value is uint which " - "will consume memory 4 bytes per filter"); + + // Required parameters + po::options_description required_configs("Required"); + required_configs.add_options()("data_type", po::value(&data_type)->required(), + program_options_utils::DATA_TYPE_DESCRIPTION); + required_configs.add_options()("dist_fn", po::value(&dist_fn)->required(), + program_options_utils::DISTANCE_FUNCTION_DESCRIPTION); + required_configs.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), + program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION); + required_configs.add_options()("data_path", po::value(&data_path)->required(), + program_options_utils::INPUT_DATA_PATH); + required_configs.add_options()("search_DRAM_budget,B", po::value(&B)->required(), + "DRAM budget in GB for searching the index to set the " + "compressed level for data while search happens"); + required_configs.add_options()("build_DRAM_budget,M", po::value(&M)->required(), + "DRAM budget in GB for building the index"); + + // Optional parameters + po::options_description optional_configs("Optional"); + optional_configs.add_options()("num_threads,T", + po::value(&num_threads)->default_value(omp_get_num_procs()), + program_options_utils::NUMBER_THREADS_DESCRIPTION); + optional_configs.add_options()("max_degree,R", 
po::value(&R)->default_value(64), + program_options_utils::MAX_BUILD_DEGREE); + optional_configs.add_options()("Lbuild,L", po::value(&L)->default_value(100), + program_options_utils::GRAPH_BUILD_COMPLEXITY); + optional_configs.add_options()("QD", po::value(&QD)->default_value(0), + " Quantized Dimension for compression"); + optional_configs.add_options()("codebook_prefix", po::value(&codebook_prefix)->default_value(""), + "Path prefix for pre-trained codebook"); + optional_configs.add_options()("PQ_disk_bytes", po::value(&disk_PQ)->default_value(0), + "Number of bytes to which vectors should be compressed " + "on SSD; 0 for no compression"); + optional_configs.add_options()("append_reorder_data", po::bool_switch()->default_value(false), + "Include full precision data in the index. Use only in " + "conjuction with compressed data on SSD."); + optional_configs.add_options()("build_PQ_bytes", po::value(&build_PQ)->default_value(0), + program_options_utils::BUIlD_GRAPH_PQ_BYTES); + optional_configs.add_options()("use_opq", po::bool_switch()->default_value(false), + program_options_utils::USE_OPQ); + optional_configs.add_options()("label_file", po::value(&label_file)->default_value(""), + program_options_utils::LABEL_FILE); + optional_configs.add_options()("universal_label", po::value(&universal_label)->default_value(""), + program_options_utils::UNIVERSAL_LABEL); + optional_configs.add_options()("FilteredLbuild", po::value(&Lf)->default_value(0), + program_options_utils::FILTERED_LBUILD); + optional_configs.add_options()("filter_threshold,F", po::value(&filter_threshold)->default_value(0), + "Threshold to break up the existing nodes to generate new graph " + "internally where each node has a maximum F labels."); + optional_configs.add_options()("label_type", po::value(&label_type)->default_value("uint"), + program_options_utils::LABEL_TYPE_DESCRIPTION); + + // Merge required and optional parameters + desc.add(required_configs).add(optional_configs); po::variables_map 
vm; po::store(po::parse_command_line(argc, argv, desc), vm); diff --git a/apps/build_memory_index.cpp b/apps/build_memory_index.cpp index d96ad7f50..92b269f4f 100644 --- a/apps/build_memory_index.cpp +++ b/apps/build_memory_index.cpp @@ -7,6 +7,7 @@ #include "index.h" #include "utils.h" +#include "program_options_utils.hpp" #ifndef _WINDOWS #include @@ -72,47 +73,50 @@ int main(int argc, char **argv) float alpha; bool use_pq_build, use_opq; - po::options_description desc{"Arguments"}; + po::options_description desc{ + program_options_utils::make_program_description("build_memory_index", "Build a memory-based DiskANN index.")}; try { desc.add_options()("help,h", "Print information on arguments"); - desc.add_options()("data_type", po::value(&data_type)->required(), "data type "); - desc.add_options()("dist_fn", po::value(&dist_fn)->required(), - "distance function "); - desc.add_options()("data_path", po::value(&data_path)->required(), - "Input data file in bin format"); - desc.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), - "Path prefix for saving index file components"); - desc.add_options()("max_degree,R", po::value(&R)->default_value(64), "Maximum graph degree"); - desc.add_options()("Lbuild,L", po::value(&L)->default_value(100), - "Build complexity, higher value results in better graphs"); - desc.add_options()("alpha", po::value(&alpha)->default_value(1.2f), - "alpha controls density and diameter of graph, set " - "1 for sparse graph, " - "1.2 or 1.4 for denser graphs with lower diameter"); - desc.add_options()("num_threads,T", po::value(&num_threads)->default_value(omp_get_num_procs()), - "Number of threads used for building index (defaults to " - "omp_get_num_procs())"); - desc.add_options()("build_PQ_bytes", po::value(&build_PQ_bytes)->default_value(0), - "Number of PQ bytes to build the index; 0 for full precision " - "build"); - desc.add_options()("use_opq", po::bool_switch()->default_value(false), - "Set true for OPQ 
compression while using PQ " - "distance comparisons for " - "building the index, and false for PQ compression"); - desc.add_options()("label_file", po::value(&label_file)->default_value(""), - "Input label file in txt format for Filtered Index search. " - "The file should contain comma separated filters for each node " - "with each line corresponding to a graph node"); - desc.add_options()("universal_label", po::value(&universal_label)->default_value(""), - "Universal label, if using it, only in conjunction with " - "labels_file"); - desc.add_options()("FilteredLbuild", po::value(&Lf)->default_value(0), - "Build complexity for filtered points, higher value " - "results in better graphs"); - desc.add_options()("label_type", po::value(&label_type)->default_value("uint"), - "Storage type of Labels , default value is uint which " - "will consume memory 4 bytes per filter"); + + // Required parameters + po::options_description required_configs("Required"); + required_configs.add_options()("data_type", po::value(&data_type)->required(), + program_options_utils::DATA_TYPE_DESCRIPTION); + required_configs.add_options()("dist_fn", po::value(&dist_fn)->required(), + program_options_utils::DISTANCE_FUNCTION_DESCRIPTION); + required_configs.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), + program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION); + required_configs.add_options()("data_path", po::value(&data_path)->required(), + program_options_utils::INPUT_DATA_PATH); + + // Optional parameters + po::options_description optional_configs("Optional"); + optional_configs.add_options()("num_threads,T", + po::value(&num_threads)->default_value(omp_get_num_procs()), + program_options_utils::NUMBER_THREADS_DESCRIPTION); + optional_configs.add_options()("max_degree,R", po::value(&R)->default_value(64), + program_options_utils::MAX_BUILD_DEGREE); + optional_configs.add_options()("Lbuild,L", po::value(&L)->default_value(100), + 
program_options_utils::GRAPH_BUILD_COMPLEXITY); + optional_configs.add_options()("alpha", po::value(&alpha)->default_value(1.2f), + program_options_utils::GRAPH_BUILD_ALPHA); + optional_configs.add_options()("build_PQ_bytes", po::value(&build_PQ_bytes)->default_value(0), + program_options_utils::BUIlD_GRAPH_PQ_BYTES); + optional_configs.add_options()("use_opq", po::bool_switch()->default_value(false), + program_options_utils::USE_OPQ); + optional_configs.add_options()("label_file", po::value(&label_file)->default_value(""), + program_options_utils::LABEL_FILE); + optional_configs.add_options()("universal_label", po::value(&universal_label)->default_value(""), + program_options_utils::UNIVERSAL_LABEL); + + optional_configs.add_options()("FilteredLbuild", po::value(&Lf)->default_value(0), + program_options_utils::FILTERED_LBUILD); + optional_configs.add_options()("label_type", po::value(&label_type)->default_value("uint"), + program_options_utils::LABEL_TYPE_DESCRIPTION); + + // Merge required and optional parameters + desc.add(required_configs).add(optional_configs); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); diff --git a/apps/build_stitched_index.cpp b/apps/build_stitched_index.cpp index 4c1941a9d..80481f8b0 100644 --- a/apps/build_stitched_index.cpp +++ b/apps/build_stitched_index.cpp @@ -18,6 +18,7 @@ #include "memory_mapper.h" #include "parameters.h" #include "utils.h" +#include "program_options_utils.hpp" namespace po = boost::program_options; typedef std::tuple>, uint64_t> stitch_indices_return_values; @@ -54,33 +55,42 @@ void handle_args(int argc, char **argv, std::string &data_type, path &input_data path &label_data_path, std::string &universal_label, uint32_t &num_threads, uint32_t &R, uint32_t &L, uint32_t &stitched_R, float &alpha) { - po::options_description desc{"Arguments"}; + po::options_description desc{ + program_options_utils::make_program_description("build_stitched_index", "Build a stitched DiskANN index.")}; 
try { desc.add_options()("help,h", "Print information on arguments"); - desc.add_options()("data_type", po::value(&data_type)->required(), "data type "); - desc.add_options()("data_path", po::value(&input_data_path)->required(), "Input data file in bin format"); - desc.add_options()("index_path_prefix", po::value(&final_index_path_prefix)->required(), - "Path prefix for saving index file components"); - desc.add_options()("max_degree,R", po::value(&R)->default_value(64), "Maximum graph degree"); - desc.add_options()("Lbuild,L", po::value(&L)->default_value(100), - "Build complexity, higher value results in better graphs"); - desc.add_options()("stitched_R", po::value(&stitched_R)->default_value(100), - "Degree to prune final graph down to"); - desc.add_options()("alpha", po::value(&alpha)->default_value(1.2f), - "alpha controls density and diameter of graph, set " - "1 for sparse graph, " - "1.2 or 1.4 for denser graphs with lower diameter"); - desc.add_options()("num_threads,T", po::value(&num_threads)->default_value(omp_get_num_procs()), - "Number of threads used for building index (defaults to " - "omp_get_num_procs())"); - desc.add_options()("label_file", po::value(&label_data_path)->default_value(""), - "Input label file in txt format if present"); - desc.add_options()("universal_label", po::value(&universal_label)->default_value(""), - "If a point comes with the specified universal label (and only the " - "univ. 
" - "label), then the point is considered to have every possible " - "label"); + + // Required parameters + po::options_description required_configs("Required"); + required_configs.add_options()("data_type", po::value(&data_type)->required(), + program_options_utils::DATA_TYPE_DESCRIPTION); + required_configs.add_options()("index_path_prefix", + po::value(&final_index_path_prefix)->required(), + program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION); + required_configs.add_options()("data_path", po::value(&input_data_path)->required(), + program_options_utils::INPUT_DATA_PATH); + + // Optional parameters + po::options_description optional_configs("Optional"); + optional_configs.add_options()("num_threads,T", + po::value(&num_threads)->default_value(omp_get_num_procs()), + program_options_utils::NUMBER_THREADS_DESCRIPTION); + optional_configs.add_options()("max_degree,R", po::value(&R)->default_value(64), + program_options_utils::MAX_BUILD_DEGREE); + optional_configs.add_options()("Lbuild,L", po::value(&L)->default_value(100), + program_options_utils::GRAPH_BUILD_COMPLEXITY); + optional_configs.add_options()("alpha", po::value(&alpha)->default_value(1.2f), + program_options_utils::GRAPH_BUILD_ALPHA); + optional_configs.add_options()("label_file", po::value(&label_data_path)->default_value(""), + program_options_utils::LABEL_FILE); + optional_configs.add_options()("universal_label", po::value(&universal_label)->default_value(""), + program_options_utils::UNIVERSAL_LABEL); + optional_configs.add_options()("stitched_R", po::value(&stitched_R)->default_value(100), + "Degree to prune final graph down to"); + + // Merge required and optional parameters + desc.add(required_configs).add(optional_configs); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); diff --git a/apps/range_search_disk_index.cpp b/apps/range_search_disk_index.cpp index 33a7283a7..31675724b 100644 --- a/apps/range_search_disk_index.cpp +++ b/apps/range_search_disk_index.cpp @@ 
-15,6 +15,7 @@ #include "pq_flash_index.h" #include "partition.h" #include "timer.h" +#include "program_options_utils.hpp" #ifndef _WINDOWS #include @@ -273,29 +274,42 @@ int main(int argc, char **argv) std::vector Lvec; float range; - po::options_description desc{"Arguments"}; + po::options_description desc{program_options_utils::make_program_description( + "range_search_disk_index", "Searches disk DiskANN indexes using ranges")}; try { desc.add_options()("help,h", "Print information on arguments"); - desc.add_options()("data_type", po::value(&data_type)->required(), "data type "); - desc.add_options()("dist_fn", po::value(&dist_fn)->required(), - "distance function "); - desc.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), - "Path prefix to the index"); - desc.add_options()("query_file", po::value(&query_file)->required(), - "Query file in binary format"); - desc.add_options()("gt_file", po::value(>_file)->default_value(std::string("null")), - "ground truth file for the queryset"); - desc.add_options()("range_threshold,K", po::value(&range)->required(), - "Number of neighbors to be returned"); - desc.add_options()("search_list,L", po::value>(&Lvec)->multitoken(), - "List of L values of search"); - desc.add_options()("beamwidth,W", po::value(&W)->default_value(2), "Beamwidth for search"); - desc.add_options()("num_nodes_to_cache", po::value(&num_nodes_to_cache)->default_value(100000), - "Beamwidth for search"); - desc.add_options()("num_threads,T", po::value(&num_threads)->default_value(omp_get_num_procs()), - "Number of threads used for building index (defaults to " - "omp_get_num_procs())"); + + // Required parameters + po::options_description required_configs("Required"); + required_configs.add_options()("data_type", po::value(&data_type)->required(), + program_options_utils::DATA_TYPE_DESCRIPTION); + required_configs.add_options()("dist_fn", po::value(&dist_fn)->required(), + program_options_utils::DISTANCE_FUNCTION_DESCRIPTION); 
+ required_configs.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), + program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION); + required_configs.add_options()("query_file", po::value(&query_file)->required(), + program_options_utils::QUERY_FILE_DESCRIPTION); + required_configs.add_options()("search_list,L", + po::value>(&Lvec)->multitoken()->required(), + program_options_utils::SEARCH_LIST_DESCRIPTION); + required_configs.add_options()("range_threshold,K", po::value(&range)->required(), + "Number of neighbors to be returned"); + + // Optional parameters + po::options_description optional_configs("Optional"); + optional_configs.add_options()("num_threads,T", + po::value(&num_threads)->default_value(omp_get_num_procs()), + program_options_utils::NUMBER_THREADS_DESCRIPTION); + optional_configs.add_options()("gt_file", po::value(>_file)->default_value(std::string("null")), + program_options_utils::GROUND_TRUTH_FILE_DESCRIPTION); + optional_configs.add_options()("num_nodes_to_cache", po::value(&num_nodes_to_cache)->default_value(0), + program_options_utils::NUMBER_OF_NODES_TO_CACHE); + optional_configs.add_options()("beamwidth,W", po::value(&W)->default_value(2), + program_options_utils::BEAMWIDTH); + + // Merge required and optional parameters + desc.add(required_configs).add(optional_configs); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); diff --git a/apps/search_disk_index.cpp b/apps/search_disk_index.cpp index 1108da97e..b46b37aef 100644 --- a/apps/search_disk_index.cpp +++ b/apps/search_disk_index.cpp @@ -12,6 +12,7 @@ #include "pq_flash_index.h" #include "timer.h" #include "percentile_stats.h" +#include "program_options_utils.hpp" #ifndef _WINDOWS #include @@ -318,48 +319,62 @@ int main(int argc, char **argv) bool use_reorder_data = false; float fail_if_recall_below = 0.0f; - po::options_description desc{"Arguments"}; + po::options_description desc{ + 
program_options_utils::make_program_description("search_disk_index", "Searches on-disk DiskANN indexes")}; try { desc.add_options()("help,h", "Print information on arguments"); - desc.add_options()("data_type", po::value(&data_type)->required(), "data type "); - desc.add_options()("dist_fn", po::value(&dist_fn)->required(), - "distance function "); - desc.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), - "Path prefix to the index"); - desc.add_options()("result_path", po::value(&result_path_prefix)->required(), - "Path prefix for saving results of the queries"); - desc.add_options()("query_file", po::value(&query_file)->required(), - "Query file in binary format"); - desc.add_options()("gt_file", po::value(>_file)->default_value(std::string("null")), - "ground truth file for the queryset"); - desc.add_options()("recall_at,K", po::value(&K)->required(), "Number of neighbors to be returned"); - desc.add_options()("search_list,L", po::value>(&Lvec)->multitoken(), - "List of L values of search"); - desc.add_options()("beamwidth,W", po::value(&W)->default_value(2), - "Beamwidth for search. Set 0 to optimize internally."); - desc.add_options()("num_nodes_to_cache", po::value(&num_nodes_to_cache)->default_value(0), - "Beamwidth for search"); - desc.add_options()("search_io_limit", - po::value(&search_io_limit)->default_value(std::numeric_limits::max()), - "Max #IOs for search"); - desc.add_options()("num_threads,T", po::value(&num_threads)->default_value(omp_get_num_procs()), - "Number of threads used for building index (defaults to " - "omp_get_num_procs())"); - desc.add_options()("use_reorder_data", po::bool_switch()->default_value(false), - "Include full precision data in the index. 
Use only in " - "conjuction with compressed data on SSD."); - desc.add_options()("filter_label", po::value(&filter_label)->default_value(std::string("")), - "Filter Label for Filtered Search"); - desc.add_options()("query_filters_file", - po::value(&query_filters_file)->default_value(std::string("")), - "Filter file for Queries for Filtered Search "); - desc.add_options()("label_type", po::value(&label_type)->default_value("uint"), - "Storage type of Labels , default value is uint which " - "will consume memory 4 bytes per filter"); - desc.add_options()("fail_if_recall_below", po::value(&fail_if_recall_below)->default_value(0.0f), - "If set to a value >0 and <100%, program returns -1 if best recall " - "found is below this threshold. "); + + // Required parameters + po::options_description required_configs("Required"); + required_configs.add_options()("data_type", po::value(&data_type)->required(), + program_options_utils::DATA_TYPE_DESCRIPTION); + required_configs.add_options()("dist_fn", po::value(&dist_fn)->required(), + program_options_utils::DISTANCE_FUNCTION_DESCRIPTION); + required_configs.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), + program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION); + required_configs.add_options()("result_path", po::value(&result_path_prefix)->required(), + program_options_utils::RESULT_PATH_DESCRIPTION); + required_configs.add_options()("query_file", po::value(&query_file)->required(), + program_options_utils::QUERY_FILE_DESCRIPTION); + required_configs.add_options()("recall_at,K", po::value(&K)->required(), + program_options_utils::NUMBER_OF_RESULTS_DESCRIPTION); + required_configs.add_options()("search_list,L", + po::value>(&Lvec)->multitoken()->required(), + program_options_utils::SEARCH_LIST_DESCRIPTION); + + // Optional parameters + po::options_description optional_configs("Optional"); + optional_configs.add_options()("gt_file", po::value(>_file)->default_value(std::string("null")), + 
program_options_utils::GROUND_TRUTH_FILE_DESCRIPTION); + optional_configs.add_options()("beamwidth,W", po::value(&W)->default_value(2), + program_options_utils::BEAMWIDTH); + optional_configs.add_options()("num_nodes_to_cache", po::value(&num_nodes_to_cache)->default_value(0), + program_options_utils::NUMBER_OF_NODES_TO_CACHE); + optional_configs.add_options()( + "search_io_limit", + po::value(&search_io_limit)->default_value(std::numeric_limits::max()), + "Max #IOs for search. Default value: uint32::max()"); + optional_configs.add_options()("num_threads,T", + po::value(&num_threads)->default_value(omp_get_num_procs()), + program_options_utils::NUMBER_THREADS_DESCRIPTION); + optional_configs.add_options()("use_reorder_data", po::bool_switch()->default_value(false), + "Include full precision data in the index. Use only in " + "conjuction with compressed data on SSD. Default value: false"); + optional_configs.add_options()("filter_label", + po::value(&filter_label)->default_value(std::string("")), + program_options_utils::FILTER_LABEL_DESCRIPTION); + optional_configs.add_options()("query_filters_file", + po::value(&query_filters_file)->default_value(std::string("")), + program_options_utils::FILTERS_FILE_DESCRIPTION); + optional_configs.add_options()("label_type", po::value(&label_type)->default_value("uint"), + program_options_utils::LABEL_TYPE_DESCRIPTION); + optional_configs.add_options()("fail_if_recall_below", + po::value(&fail_if_recall_below)->default_value(0.0f), + program_options_utils::FAIL_IF_RECALL_BELOW); + + // Merge required and optional parameters + desc.add(required_configs).add(optional_configs); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); diff --git a/apps/search_memory_index.cpp b/apps/search_memory_index.cpp index ca3045331..44817242c 100644 --- a/apps/search_memory_index.cpp +++ b/apps/search_memory_index.cpp @@ -20,6 +20,7 @@ #include "index.h" #include "memory_mapper.h" #include "utils.h" +#include 
"program_options_utils.hpp" #include "index_factory.h" namespace po = boost::program_options; @@ -271,48 +272,65 @@ int main(int argc, char **argv) bool print_all_recalls, dynamic, tags, show_qps_per_thread; float fail_if_recall_below = 0.0f; - po::options_description desc{"Arguments"}; + po::options_description desc{ + program_options_utils::make_program_description("search_memory_index", "Searches in-memory DiskANN indexes")}; try { - desc.add_options()("help,h", "Print information on arguments"); - desc.add_options()("data_type", po::value(&data_type)->required(), "data type "); - desc.add_options()("dist_fn", po::value(&dist_fn)->required(), - "distance function "); - desc.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), - "Path prefix to the index"); - desc.add_options()("result_path", po::value(&result_path)->required(), - "Path prefix for saving results of the queries"); - desc.add_options()("query_file", po::value(&query_file)->required(), - "Query file in binary format"); - desc.add_options()("filter_label", po::value(&filter_label)->default_value(std::string("")), - "Filter Label for Filtered Search"); - desc.add_options()("query_filters_file", - po::value(&query_filters_file)->default_value(std::string("")), - "Filter file for Queries for Filtered Search "); - desc.add_options()("label_type", po::value(&label_type)->default_value("uint"), - "Storage type of Labels , default value is uint which " - "will consume memory 4 bytes per filter"); - desc.add_options()("gt_file", po::value(>_file)->default_value(std::string("null")), - "ground truth file for the queryset"); - desc.add_options()("recall_at,K", po::value(&K)->required(), "Number of neighbors to be returned"); - desc.add_options()("print_all_recalls", po::bool_switch(&print_all_recalls), - "Print recalls at all positions, from 1 up to specified " - "recall_at value"); - desc.add_options()("search_list,L", po::value>(&Lvec)->multitoken(), - "List of L values of search"); 
- desc.add_options()("num_threads,T", po::value(&num_threads)->default_value(omp_get_num_procs()), - "Number of threads used for building index (defaults to " - "omp_get_num_procs())"); - desc.add_options()("dynamic", po::value(&dynamic)->default_value(false), - "Whether the index is dynamic. Default false."); - desc.add_options()("tags", po::value(&tags)->default_value(false), - "Whether to search with tags. Default false."); - desc.add_options()("qps_per_thread", po::bool_switch(&show_qps_per_thread), - "Print overall QPS divided by the number of threads in " - "the output table"); - desc.add_options()("fail_if_recall_below", po::value(&fail_if_recall_below)->default_value(0.0f), - "If set to a value >0 and <100%, program returns -1 if best recall " - "found is below this threshold. "); + desc.add_options()("help,h", "Print this information on arguments"); + + // Required parameters + po::options_description required_configs("Required"); + required_configs.add_options()("data_type", po::value(&data_type)->required(), + program_options_utils::DATA_TYPE_DESCRIPTION); + required_configs.add_options()("dist_fn", po::value(&dist_fn)->required(), + program_options_utils::DISTANCE_FUNCTION_DESCRIPTION); + required_configs.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), + program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION); + required_configs.add_options()("result_path", po::value(&result_path)->required(), + program_options_utils::RESULT_PATH_DESCRIPTION); + required_configs.add_options()("query_file", po::value(&query_file)->required(), + program_options_utils::QUERY_FILE_DESCRIPTION); + required_configs.add_options()("recall_at,K", po::value(&K)->required(), + program_options_utils::NUMBER_OF_RESULTS_DESCRIPTION); + required_configs.add_options()("search_list,L", + po::value>(&Lvec)->multitoken()->required(), + program_options_utils::SEARCH_LIST_DESCRIPTION); + + // Optional parameters + po::options_description 
optional_configs("Optional"); + optional_configs.add_options()("filter_label", + po::value(&filter_label)->default_value(std::string("")), + program_options_utils::FILTER_LABEL_DESCRIPTION); + optional_configs.add_options()("query_filters_file", + po::value(&query_filters_file)->default_value(std::string("")), + program_options_utils::FILTERS_FILE_DESCRIPTION); + optional_configs.add_options()("label_type", po::value(&label_type)->default_value("uint"), + program_options_utils::LABEL_TYPE_DESCRIPTION); + optional_configs.add_options()("gt_file", po::value(>_file)->default_value(std::string("null")), + program_options_utils::GROUND_TRUTH_FILE_DESCRIPTION); + optional_configs.add_options()("num_threads,T", + po::value(&num_threads)->default_value(omp_get_num_procs()), + program_options_utils::NUMBER_THREADS_DESCRIPTION); + optional_configs.add_options()( + "dynamic", po::value(&dynamic)->default_value(false), + "Whether the index is dynamic. Dynamic indices must have associated tags. Default false."); + optional_configs.add_options()("tags", po::value(&tags)->default_value(false), + "Whether to search with external identifiers (tags). 
Default false."); + optional_configs.add_options()("fail_if_recall_below", + po::value(&fail_if_recall_below)->default_value(0.0f), + program_options_utils::FAIL_IF_RECALL_BELOW); + + // Output controls + po::options_description output_controls("Output controls"); + output_controls.add_options()("print_all_recalls", po::bool_switch(&print_all_recalls), + "Print recalls at all positions, from 1 up to specified " + "recall_at value"); + output_controls.add_options()("print_qps_per_thread", po::bool_switch(&show_qps_per_thread), + "Print overall QPS divided by the number of threads in " + "the output table"); + + // Merge required and optional parameters + desc.add(required_configs).add(optional_configs).add(output_controls); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); diff --git a/apps/test_insert_deletes_consolidate.cpp b/apps/test_insert_deletes_consolidate.cpp index 4d64de3a5..700f4d7b6 100644 --- a/apps/test_insert_deletes_consolidate.cpp +++ b/apps/test_insert_deletes_consolidate.cpp @@ -11,6 +11,7 @@ #include #include "utils.h" +#include "program_options_utils.hpp" #include "index_factory.h" #ifndef _WINDOWS @@ -336,49 +337,63 @@ int main(int argc, char **argv) points_to_delete_from_beginning, start_deletes_after; bool concurrent; - po::options_description desc{"Arguments"}; + po::options_description desc{program_options_utils::make_program_description("test_insert_deletes_consolidate", + "Test insert deletes & consolidate")}; try { desc.add_options()("help,h", "Print information on arguments"); - desc.add_options()("data_type", po::value(&data_type)->required(), "data type "); - desc.add_options()("dist_fn", po::value(&dist_fn)->required(), "distance function "); - desc.add_options()("data_path", po::value(&data_path)->required(), - "Input data file in bin format"); - desc.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), - "Path prefix for saving index file components"); - 
desc.add_options()("max_degree,R", po::value(&R)->default_value(64), "Maximum graph degree"); - desc.add_options()("Lbuild,L", po::value(&L)->default_value(100), - "Build complexity, higher value results in better graphs"); - desc.add_options()("alpha", po::value(&alpha)->default_value(1.2f), - "alpha controls density and diameter of graph, set " - "1 for sparse graph, " - "1.2 or 1.4 for denser graphs with lower diameter"); - desc.add_options()("num_threads,T", po::value(&num_threads)->default_value(omp_get_num_procs()), - "Number of threads used for building index (defaults to " - "omp_get_num_procs())"); - desc.add_options()("points_to_skip", po::value(&points_to_skip)->required(), - "Skip these first set of points from file"); - desc.add_options()("max_points_to_insert", po::value(&max_points_to_insert)->default_value(0), - "These number of points from the file are inserted after " - "points_to_skip"); - desc.add_options()("beginning_index_size", po::value(&beginning_index_size)->required(), - "Batch build will be called on these set of points"); - desc.add_options()("points_per_checkpoint", po::value(&points_per_checkpoint)->required(), - "Insertions are done in batches of points_per_checkpoint"); - desc.add_options()("checkpoints_per_snapshot", po::value(&checkpoints_per_snapshot)->required(), - "Save the index to disk every few checkpoints"); - desc.add_options()("points_to_delete_from_beginning", - po::value(&points_to_delete_from_beginning)->required(), ""); - desc.add_options()("do_concurrent", po::value(&concurrent)->default_value(false), ""); - desc.add_options()("start_deletes_after", po::value(&start_deletes_after)->default_value(0), ""); - desc.add_options()("start_point_norm", po::value(&start_point_norm)->default_value(0), - "Set the start point to a random point on a sphere of this radius"); - desc.add_options()( + + // Required parameters + po::options_description required_configs("Required"); + required_configs.add_options()("data_type", 
po::value(&data_type)->required(), + program_options_utils::DATA_TYPE_DESCRIPTION); + required_configs.add_options()("dist_fn", po::value(&dist_fn)->required(), + program_options_utils::DISTANCE_FUNCTION_DESCRIPTION); + required_configs.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), + program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION); + required_configs.add_options()("data_path", po::value(&data_path)->required(), + program_options_utils::INPUT_DATA_PATH); + required_configs.add_options()("points_to_skip", po::value(&points_to_skip)->required(), + "Skip these first set of points from file"); + required_configs.add_options()("beginning_index_size", po::value(&beginning_index_size)->required(), + "Batch build will be called on these set of points"); + required_configs.add_options()("points_per_checkpoint", po::value(&points_per_checkpoint)->required(), + "Insertions are done in batches of points_per_checkpoint"); + required_configs.add_options()("checkpoints_per_snapshot", + po::value(&checkpoints_per_snapshot)->required(), + "Save the index to disk every few checkpoints"); + required_configs.add_options()("points_to_delete_from_beginning", + po::value(&points_to_delete_from_beginning)->required(), ""); + + // Optional parameters + po::options_description optional_configs("Optional"); + optional_configs.add_options()("num_threads,T", + po::value(&num_threads)->default_value(omp_get_num_procs()), + program_options_utils::NUMBER_THREADS_DESCRIPTION); + optional_configs.add_options()("max_degree,R", po::value(&R)->default_value(64), + program_options_utils::MAX_BUILD_DEGREE); + optional_configs.add_options()("Lbuild,L", po::value(&L)->default_value(100), + program_options_utils::GRAPH_BUILD_COMPLEXITY); + optional_configs.add_options()("alpha", po::value(&alpha)->default_value(1.2f), + program_options_utils::GRAPH_BUILD_ALPHA); + optional_configs.add_options()("max_points_to_insert", + 
po::value(&max_points_to_insert)->default_value(0), + "These number of points from the file are inserted after " + "points_to_skip"); + optional_configs.add_options()("do_concurrent", po::value(&concurrent)->default_value(false), ""); + optional_configs.add_options()("start_deletes_after", + po::value(&start_deletes_after)->default_value(0), ""); + optional_configs.add_options()("start_point_norm", po::value(&start_point_norm)->default_value(0), + "Set the start point to a random point on a sphere of this radius"); + optional_configs.add_options()( "num_start_points", po::value(&num_start_pts)->default_value(diskann::defaults::NUM_FROZEN_POINTS_DYNAMIC), "Set the number of random start (frozen) points to use when " "inserting and searching"); + // Merge required and optional parameters + desc.add(required_configs).add(optional_configs); + po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); if (vm.count("help")) diff --git a/apps/test_streaming_scenario.cpp b/apps/test_streaming_scenario.cpp index c48c74843..55e4e61cf 100644 --- a/apps/test_streaming_scenario.cpp +++ b/apps/test_streaming_scenario.cpp @@ -13,6 +13,7 @@ #include #include "utils.h" +#include "program_options_utils.hpp" #ifndef _WINDOWS #include @@ -299,50 +300,61 @@ int main(int argc, char **argv) float alpha, start_point_norm; size_t max_points_to_insert, active_window, consolidate_interval; - po::options_description desc{"Arguments"}; + po::options_description desc{program_options_utils::make_program_description("test_streaming_scenario", + "Test insert deletes & consolidate")}; try { desc.add_options()("help,h", "Print information on arguments"); - desc.add_options()("data_type", po::value(&data_type)->required(), "data type "); - desc.add_options()("dist_fn", po::value(&dist_fn)->required(), "distance function "); - desc.add_options()("data_path", po::value(&data_path)->required(), - "Input data file in bin format"); - desc.add_options()("index_path_prefix", 
po::value(&index_path_prefix)->required(), - "Path prefix for saving index file components"); - desc.add_options()("max_degree,R", po::value(&R)->default_value(64), "Maximum graph degree"); - desc.add_options()("Lbuild,L", po::value(&L)->default_value(100), - "Build complexity, higher value results in better graphs"); - desc.add_options()("alpha", po::value(&alpha)->default_value(1.2f), - "alpha controls density and diameter of graph, set " - "1 for sparse graph, " - "1.2 or 1.4 for denser graphs with lower diameter"); - desc.add_options()("insert_threads", - po::value(&insert_threads)->default_value(omp_get_num_procs() / 2), - "Number of threads used for inserting into the index (defaults to " - "omp_get_num_procs()/2)"); - desc.add_options()("consolidate_threads", - po::value(&consolidate_threads)->default_value(omp_get_num_procs() / 2), - "Number of threads used for consolidating deletes to " - "the index (defaults to omp_get_num_procs()/2)"); - - desc.add_options()("max_points_to_insert", po::value(&max_points_to_insert)->default_value(0), - "The number of points from the file that the program streams " - "over "); - desc.add_options()("active_window", po::value(&active_window)->required(), - "Program maintains an index over an active window of " - "this size that slides through the data"); - desc.add_options()("consolidate_interval", po::value(&consolidate_interval)->required(), - "The program simultaneously adds this number of points to the " - "right of " - "the window while deleting the same number from the left"); - desc.add_options()("start_point_norm", po::value(&start_point_norm)->required(), - "Set the start point to a random point on a sphere of this radius"); - desc.add_options()( + + // Required parameters + po::options_description required_configs("Required"); + required_configs.add_options()("data_type", po::value(&data_type)->required(), + program_options_utils::DATA_TYPE_DESCRIPTION); + required_configs.add_options()("dist_fn", 
po::value(&dist_fn)->required(), + program_options_utils::DISTANCE_FUNCTION_DESCRIPTION); + required_configs.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), + program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION); + required_configs.add_options()("data_path", po::value(&data_path)->required(), + program_options_utils::INPUT_DATA_PATH); + required_configs.add_options()("active_window", po::value(&active_window)->required(), + "Program maintains an index over an active window of " + "this size that slides through the data"); + required_configs.add_options()("consolidate_interval", po::value(&consolidate_interval)->required(), + "The program simultaneously adds this number of points to the " + "right of " + "the window while deleting the same number from the left"); + required_configs.add_options()("start_point_norm", po::value(&start_point_norm)->required(), + "Set the start point to a random point on a sphere of this radius"); + + // Optional parameters + po::options_description optional_configs("Optional"); + optional_configs.add_options()("max_degree,R", po::value(&R)->default_value(64), + program_options_utils::MAX_BUILD_DEGREE); + optional_configs.add_options()("Lbuild,L", po::value(&L)->default_value(100), + program_options_utils::GRAPH_BUILD_COMPLEXITY); + optional_configs.add_options()("alpha", po::value(&alpha)->default_value(1.2f), + program_options_utils::GRAPH_BUILD_ALPHA); + optional_configs.add_options()("insert_threads", + po::value(&insert_threads)->default_value(omp_get_num_procs() / 2), + "Number of threads used for inserting into the index (defaults to " + "omp_get_num_procs()/2)"); + optional_configs.add_options()( + "consolidate_threads", po::value(&consolidate_threads)->default_value(omp_get_num_procs() / 2), + "Number of threads used for consolidating deletes to " + "the index (defaults to omp_get_num_procs()/2)"); + optional_configs.add_options()("max_points_to_insert", + 
// Shared help-text strings and helpers for the command line apps. The apps pass these constants as the
// description argument of boost::program_options `add_options()` entries, so that every executable describes
// common parameters with the same wording.
//
// Everything in this namespace is safe to include from several translation units: the function is `inline` and
// the constants are `const` pointers, which have internal linkage at namespace scope.
namespace program_options_utils
{
// Builds the banner the apps hand to the `po::options_description` constructor.
//
// executable_name: name shown after "Usage: "; a null pointer is treated as an empty string.
// description:     one-line summary of the program; a null pointer is treated as an empty string.
//
// Returns "\n<description>\n\nUsage: <executable_name> [OPTIONS]".
inline std::string make_program_description(const char *executable_name, const char *description)
{
    // std::string::append(const char *) requires a valid C string, so map null to "" up front.
    const char *const safe_description = (description != nullptr) ? description : "";
    const char *const safe_executable_name = (executable_name != nullptr) ? executable_name : "";

    std::string banner("\n");
    banner.append(safe_description);
    banner.append("\n\n");
    banner.append("Usage: ");
    banner.append(safe_executable_name);
    banner.append(" [OPTIONS]");
    return banner;
}

// Required parameters
const char *const DATA_TYPE_DESCRIPTION =
    "data type, one of {int8, uint8, float} - float is single precision (32 bit)";
const char *const DISTANCE_FUNCTION_DESCRIPTION =
    "distance function {l2, mips, fast_l2, cosine}. 'fast l2' and 'mips' only support data_type float";
const char *const INDEX_PATH_PREFIX_DESCRIPTION = "Path prefix to the index, e.g. '/mnt/data/my_ann_index'";
const char *const RESULT_PATH_DESCRIPTION =
    "Path prefix for saving results of the queries, e.g. '/mnt/data/query_file_X.bin'";
const char *const QUERY_FILE_DESCRIPTION = "Query file in binary format, e.g. '/mnt/data/query_file_X.bin'";
const char *const NUMBER_OF_RESULTS_DESCRIPTION =
    "Number of neighbors to be returned (K in the DiskANN white paper)";
const char *const SEARCH_LIST_DESCRIPTION =
    "Size of search list to use. This value is the number of neighbor/distance pairs to keep in memory at the same "
    "time while performing a query. This can also be described as the size of the working set at query time. This "
    "must be greater than or equal to the number of results/neighbors to return (K in the white paper). Corresponds "
    "to L in the DiskANN white paper.";
const char *const INPUT_DATA_PATH =
    "Input data file in bin format. This is the file you want to build the index over. "
    "File format: Shape of the vector followed by the vector of embeddings as binary data.";

// Optional parameters
const char *const FILTER_LABEL_DESCRIPTION =
    "Filter to use when running a query. 'filter_label' and 'query_filters_file' are mutually exclusive.";
const char *const FILTERS_FILE_DESCRIPTION =
    "Filter file for Queries for Filtered Search. File format is text with one filter per line. File must "
    "have exactly one filter OR the same number of filters as there are queries in the 'query_file'.";
const char *const LABEL_TYPE_DESCRIPTION =
    "Storage type of Labels {uint/uint32, ushort/uint16}, default value is uint which will consume memory 4 bytes per "
    "filter. 'uint' is an alias for 'uint32' and 'ushort' is an alias for 'uint16'.";
// TODO: document the ground truth file format and its requirements, e.g. whether it needs an entry for every
// item or only for a subset.
const char *const GROUND_TRUTH_FILE_DESCRIPTION = "ground truth file for the queryset";
const char *const NUMBER_THREADS_DESCRIPTION =
    "Number of threads used for building index. Defaults to number of logical "
    "processor cores on this machine returned by omp_get_num_procs()";
// TODO(review): the original author asked whether the program keeps running or stops immediately when recall
// falls below the threshold, and whether results are still produced. The text below states that results are
// written; confirm this against the apps that use the option.
const char *const FAIL_IF_RECALL_BELOW =
    "Value between 0 (inclusive) and 100 (exclusive) indicating the recall tolerance percentage threshold before "
    "program fails with a non-zero exit code. The default value of 0 means that the program will complete "
    "successfully with any recall value. A non-zero value indicates the floor for acceptable recall values. If the "
    "calculated recall value is below this threshold then the program will write out the results but return a non-zero "
    "exit code as a signal that the recall was not acceptable.";

const char *const NUMBER_OF_NODES_TO_CACHE = "Number of BFS nodes around medoid(s) to cache. Default value: 0";
const char *const BEAMWIDTH = "Beamwidth for search. Set 0 to optimize internally. Default value: 2";
const char *const MAX_BUILD_DEGREE = "Maximum graph degree";
const char *const GRAPH_BUILD_COMPLEXITY =
    "Size of the search working set during build time. This is the number of neighbor/distance pairs to keep in memory "
    "while building the index. Higher value results in a higher quality graph but it will take more time to build the "
    "graph.";
const char *const GRAPH_BUILD_ALPHA =
    "Alpha controls density and diameter of graph, set 1 for sparse graph, 1.2 or 1.4 for "
    "denser graphs with lower diameter";
// The historical name contains a lowercase 'l' ("BUIlD"). It is kept so existing callers still compile;
// new code should use the correctly spelled BUILD_GRAPH_PQ_BYTES alias below.
const char *const BUIlD_GRAPH_PQ_BYTES = "Number of PQ bytes to build the index; 0 for full precision build";
const char *const BUILD_GRAPH_PQ_BYTES = BUIlD_GRAPH_PQ_BYTES;
const char *const USE_OPQ = "Use Optimized Product Quantization (OPQ).";
const char *const LABEL_FILE =
    "Input label file in txt format for Filtered Index build. The file should contain comma "
    "separated filters for each node with each line corresponding to a graph node";
const char *const UNIVERSAL_LABEL =
    "Universal label, Use only in conjunction with label file for filtered index build. If a "
    "graph node has all the labels against it, we can assign a special universal filter to the "
    "point instead of comma separated filters for that point";
const char *const FILTERED_LBUILD =
    "Build complexity for filtered points, higher value results in better graphs";

} // namespace program_options_utils