From c6ef28925c95255dd2af88ad4c7e2b7e6e1116fe Mon Sep 17 00:00:00 2001 From: Mengdi Lin Date: Tue, 14 May 2024 13:57:59 -0700 Subject: [PATCH] interrupt for NNDescent (#3432) Summary: Addresses the issue in https://github.com/facebookresearch/faiss/issues/3173 for `IndexNNDescent`. I see that an interrupt is already implemented for its [search](https://fburl.com/code/iwn3tqic) API, so I looked into its `add` API. A run with nb = 10 mil, iter = 10, K = 32, d = 32 on a CPU-only machine reveals that the bulk of the cost comes from [nndescent](https://fburl.com/code/5rdb1p5o). Every iteration of `nndescent` takes ~12 seconds: ~70-80% of the time is spent in the `join` method (~10 seconds per iteration) and ~20-30% is spent in `update` (~2 seconds per iteration). Adding the interrupt check to `join` should suffice for quickly terminating the program when users hit Ctrl+C (happy to move the interrupt elsewhere if we think otherwise). Reviewed By: junjieqi, mdouze Differential Revision: D57300514 --- .circleci/config.yml | 13 +++++----- .circleci/setup-clang-format.sh | 43 +++++++++++++++++++++++++++++++++ faiss/impl/NNDescent.cpp | 21 ++++++++++------ 3 files changed, 62 insertions(+), 15 deletions(-) create mode 100755 .circleci/setup-clang-format.sh diff --git a/.circleci/config.yml b/.circleci/config.yml index 7e8bd8170a..07eb9d2542 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -39,16 +39,15 @@ jobs: name: Install clang-format command: | apt-get update -y - apt-get install -y wget - apt install -y lsb-release wget software-properties-common gnupg - wget https://apt.llvm.org/llvm.sh - chmod u+x llvm.sh - ./llvm.sh 18 - apt-get install -y git-core clang-format-18 + apt-get install -y curl tar gzip unzip + bash .circleci/setup-clang-format.sh + apt-get install -y git-core - run: name: Verify clang-format command: | - git ls-files | grep -E '\.(cpp|h|cu|cuh)$' | xargs clang-format-18 -i + ls
/root/project/clang-format/clang_format/data/bin/clang-format + git ls-files | grep -E '\.(cpp|h|cu|cuh)$' + git ls-files | grep -E '\.(cpp|h|cu|cuh)$' | xargs /root/project/clang-format/clang_format/data/bin/clang-format -i if git diff --quiet; then echo "Formatting OK!" else diff --git a/.circleci/setup-clang-format.sh b/.circleci/setup-clang-format.sh new file mode 100755 index 0000000000..a94691f9e1 --- /dev/null +++ b/.circleci/setup-clang-format.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +set -euo pipefail +# https://reproducible-builds.org/docs/archives/ +deterministic_tar_gz() { + # Use year 2030 to thwart tmpreaper. + tar \ + --sort=name \ + --mtime=2030-01-01T00:00:00Z \ + --owner=0 --group=0 --numeric-owner \ + -cf- \ + "${@:2}" \ + | gzip -9n \ + > "$1" +} + +# To curl from devservers +if host -W 1 fwdproxy >/dev/null; then + curl() { HTTP_PROXY=fwdproxy:8080 HTTPS_PROXY=fwdproxy:8080 command curl "$@"; } +fi + +### CHANGE THESE VARIABLES WHEN UPDATING ### + +# https://pypi.org/project/clang-format/18.1.3 + + +LINUX_X86_64_URL=https://files.pythonhosted.org/packages/d5/9c/4f3806d20397790b3cd80aef89d295bf399581804f5c5758b6207e54e902/clang_format-18.1.3-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + +### +NAME=${LINUX_X86_64_URL##*/} + +set -x +curl -L -o "$NAME.zip" "$LINUX_X86_64_URL" +mkdir "clang-format" +unzip -q -d "clang-format" "$NAME.zip" +echo "clang-format/clang_format/data/bin" +deterministic_tar_gz "$NAME.tar.gz" -C "clang-format/clang_format/data/bin" "clang-format" +tar tvf "$NAME.tar.gz" + +echo "$PWD" +export PATH="${PWD}/clang-format/clang_format/data/bin:$PATH" +echo $PATH diff --git a/faiss/impl/NNDescent.cpp b/faiss/impl/NNDescent.cpp index b609aba390..5afcdaf5b7 100644 --- a/faiss/impl/NNDescent.cpp +++ b/faiss/impl/NNDescent.cpp @@ -154,15 +154,20 @@ NNDescent::NNDescent(const int d, const int K) : K(K), d(d) { NNDescent::~NNDescent() {} void 
NNDescent::join(DistanceComputer& qdis) { + idx_t check_period = InterruptCallback::get_period_hint(d * search_L); + for (idx_t i0 = 0; i0 < (idx_t)ntotal; i0 += check_period) { + idx_t i1 = std::min(i0 + check_period, (idx_t)ntotal); #pragma omp parallel for default(shared) schedule(dynamic, 100) - for (int n = 0; n < ntotal; n++) { - graph[n].join([&](int i, int j) { - if (i != j) { - float dist = qdis.symmetric_dis(i, j); - graph[i].insert(j, dist); - graph[j].insert(i, dist); - } - }); + for (idx_t n = i0; n < i1; n++) { + graph[n].join([&](int i, int j) { + if (i != j) { + float dist = qdis.symmetric_dis(i, j); + graph[i].insert(j, dist); + graph[j].insert(i, dist); + } + }); + } + InterruptCallback::check(); } }