Python Type Enhancements #364

Merged
merged 9 commits on Jun 1, 2023
76 changes: 61 additions & 15 deletions CMakeLists.txt
@@ -139,20 +139,63 @@ if (MSVC)
        "${DISKANN_MKL_LIB_PATH}/mkl_intel_thread.lib")
else()
    # expected path for manual intel mkl installs
    set(OMP_PATH /opt/intel/oneapi/compiler/2022.0.2/linux/compiler/lib/intel64_lin/ CACHE PATH "Intel OneAPI OpenMP library implementation path")
    set(MKL_ROOT /opt/intel/oneapi/mkl/latest CACHE PATH "Intel OneAPI MKL library implementation path")
    link_directories(${OMP_PATH} ${MKL_ROOT}/lib/intel64)
    include_directories(${MKL_ROOT}/include)
    # todo: make sure that the link_directories and include_directories for mkl also include paths that are used
    # when using a RH derivative distro (for python build through cibuildwheel)

    # expected path for apt packaged intel mkl installs
    link_directories(/usr/lib/x86_64-linux-gnu/mkl)
    include_directories(/usr/include/mkl)
    set(POSSIBLE_OMP_PATHS "/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin/libiomp5.so;/usr/lib/x86_64-linux-gnu/libiomp5.so;/opt/intel/lib/intel64_lin/libiomp5.so")
    foreach(POSSIBLE_OMP_PATH ${POSSIBLE_OMP_PATHS})
        if (EXISTS ${POSSIBLE_OMP_PATH})
            get_filename_component(OMP_PATH ${POSSIBLE_OMP_PATH} DIRECTORY)
        endif()
    endforeach()

    if(NOT OMP_PATH)
        message(FATAL_ERROR "Could not find Intel OMP in standard locations; use -DOMP_PATH to specify the install location for your environment")
    endif()
    link_directories(${OMP_PATH})

    set(POSSIBLE_MKL_LIB_PATHS "/opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_core.so;/usr/lib/x86_64-linux-gnu/libmkl_core.so;/opt/intel/mkl/lib/intel64/libmkl_core.so")
    foreach(POSSIBLE_MKL_LIB_PATH ${POSSIBLE_MKL_LIB_PATHS})
        if (EXISTS ${POSSIBLE_MKL_LIB_PATH})
            get_filename_component(MKL_PATH ${POSSIBLE_MKL_LIB_PATH} DIRECTORY)
        endif()
    endforeach()

    set(POSSIBLE_MKL_INCLUDE_PATHS "/opt/intel/oneapi/mkl/latest/include;/usr/include/mkl;/opt/intel/mkl/include/;")
    foreach(POSSIBLE_MKL_INCLUDE_PATH ${POSSIBLE_MKL_INCLUDE_PATHS})
        if (EXISTS ${POSSIBLE_MKL_INCLUDE_PATH})
            set(MKL_INCLUDE_PATH ${POSSIBLE_MKL_INCLUDE_PATH})
        endif()
    endforeach()
    if(NOT MKL_PATH)
        message(FATAL_ERROR "Could not find Intel MKL in standard locations; use -DMKL_PATH to specify the install location for your environment")
    elseif(NOT MKL_INCLUDE_PATH)
        message(FATAL_ERROR "Could not find Intel MKL in standard locations; use -DMKL_INCLUDE_PATH to specify the install location for headers for your environment")
    endif()
    if (EXISTS ${MKL_PATH}/libmkl_def.so.2)
        set(MKL_DEF_SO ${MKL_PATH}/libmkl_def.so.2)
    elseif(EXISTS ${MKL_PATH}/libmkl_def.so)
        set(MKL_DEF_SO ${MKL_PATH}/libmkl_def.so)
    else()
        message(FATAL_ERROR "Despite finding MKL, libmkl_def.so was not found in expected locations.")
    endif()
    link_directories(${MKL_PATH})
    include_directories(${MKL_INCLUDE_PATH})

    # compile flags and link libraries
    add_compile_options(-m64 -Wl,--no-as-needed)
    link_libraries(mkl_intel_ilp64 mkl_intel_thread mkl_core iomp5 pthread m dl)
    if (NOT PYBIND)
        link_libraries(mkl_intel_ilp64 mkl_intel_thread mkl_core iomp5 pthread m dl)
    else()
        # static linking for python so as to minimize customer dependency issues
        link_libraries(
            ${MKL_PATH}/libmkl_intel_ilp64.a
            ${MKL_PATH}/libmkl_intel_thread.a
            ${MKL_PATH}/libmkl_core.a
            ${MKL_DEF_SO}
            iomp5
            pthread
            m
            dl
        )
    endif()
endif()

add_definitions(-DMKL_ILP64)
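
One quick way to confirm the static-linking intent above in a built wheel is to list the pybind extension's remaining dynamic dependencies: with the .a archives linked statically, the only libmkl entry that should survive is libmkl_def, which the CMake above deliberately links as a shared object. A Linux-only sketch, not part of the diff; the diskannpy._diskannpy module name is taken from the imports in python/src/__init__.py below:

import importlib.util
import subprocess

# Locate the compiled pybind extension inside the installed diskannpy package...
spec = importlib.util.find_spec("diskannpy._diskannpy")
# ...and ask the dynamic linker which shared objects it still needs at runtime.
print(subprocess.run(["ldd", spec.origin], capture_output=True, text=True).stdout)
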
@@ -198,7 +241,7 @@ if (MSVC)
        add_dependencies(libtcmalloc_minimal_for_exe build_libtcmalloc_minimal)

        set(DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS libtcmalloc_minimal_for_exe)
    else()
    elseif(NOT PYBIND)
        set(DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS "-ltcmalloc")
    endif()
@@ -239,11 +282,14 @@ else()
    set(ENV{TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD} 500000000000)
    # set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -DDEBUG -O0 -fsanitize=address -fsanitize=leak -fsanitize=undefined")
    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -DDEBUG -Wall -Wextra")
    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -DNDEBUG -march=native -mtune=native -ftree-vectorize")
    add_compile_options(-march=native -Wall -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fopenmp -fopenmp-simd -funroll-loops -Wfatal-errors -DUSE_AVX2)
    if (PYBIND)
    if (NOT PYBIND)
        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -DNDEBUG -march=native -mtune=native -ftree-vectorize")
    else()
        # -Ofast is super problematic for python. see: https://moyix.blogspot.com/2022/09/someones-been-messing-with-my-subnormals.html
        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -march=native -mtune=native -ftree-vectorize")
        add_compile_options(-fPIC)
    endif()
    add_compile_options(-march=native -Wall -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fopenmp -fopenmp-simd -funroll-loops -Wfatal-errors -DUSE_AVX2)
endif()

add_subdirectory(src)
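
The subnormal problem referenced in the -Ofast comment above is easy to demonstrate: a shared object compiled with -Ofast (which implies -ffast-math) can enable flush-to-zero/denormals-are-zero for the whole process the moment it is loaded. An illustrative check, not part of the PR:

import numpy as np

def subnormals_ok() -> bool:
    # Half of the smallest positive *normal* double is a subnormal value;
    # under FTZ the division result is flushed to 0.0 instead.
    tiny = float(np.finfo(np.float64).tiny)
    return tiny / 2.0 != 0.0

print(subnormals_ok())  # True under standard IEEE-754 semantics
import diskannpy        # an -Ofast-built extension could flip FTZ/DAZ right here
print(subnormals_ok())  # with the -fPIC / no -Ofast flags above, this stays True

Dropping -Ofast for the Python build gives up some optimizer aggressiveness but keeps the host interpreter's floating-point semantics intact.
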
23 changes: 10 additions & 13 deletions pyproject.toml
@@ -32,19 +32,16 @@ package-dir = {"" = "python/src"}
manylinux-x86_64-image = "manylinux_2_28"
test-requires = ["scikit-learn~=1.2"]
build-frontend = "build"
skip = "pp* *musllinux*"
skip = ["pp*", "*-win32", "*-manylinux_i686", "*-musllinux*"]
test-command = "python -m unittest discover {project}/python/tests"


[tool.cibuildwheel.linux]
before-all = """\
dnf makecache --refresh && \
dnf install -y epel-release && \
dnf config-manager -y --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo && \
rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB && \
dnf makecache --refresh -y && \
dnf install -y wget make cmake gcc-c++ libaio-devel gperftools-libs libunwind-devel clang-tools-extra boost-devel boost-program-options intel-mkl-2020.4-912
"""

test-command = """\
LD_PRELOAD="/lib/x86_64-linux-gnu/libmkl_intel_thread.so:/lib/x86_64-linux-gnu/libmkl_intel_ilp64.so:/lib/x86_64-linux-gnu/libmkl_core.so:/lib/x86_64-linux-gnu/libiomp5.so:/lib/x86_64-linux-gnu/libmkl_avx2.so:/lib/x86_64-linux-gnu/libmkl_def.so" python -m unittest discover python/tests
"""
before-build = [
"dnf makecache --refresh",
"dnf install -y epel-release",
"dnf config-manager -y --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo",
"rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB",
"dnf makecache --refresh -y",
"dnf install -y wget make cmake gcc-c++ libaio-devel gperftools-libs libunwind-devel clang-tools-extra boost-devel boost-program-options intel-mkl-2020.4-912"
]
152 changes: 152 additions & 0 deletions python/apps/cli/__main__.py
@@ -0,0 +1,152 @@
import diskannpy as dap
import numpy as np
import numpy.typing as npt

import fire

from contextlib import contextmanager
from time import perf_counter

from typing import Tuple


def _basic_setup(
    dtype: str,
    query_vectors_file: str
) -> Tuple[dap.VectorDType, npt.NDArray[dap.VectorDType]]:
    _dtype = dap.valid_dtype(dtype)
    vectors_to_query = dap.vectors_from_binary(query_vectors_file, dtype=_dtype)
    return _dtype, vectors_to_query


def dynamic(
    dtype: str,
    index_vectors_file: str,
    query_vectors_file: str,
    build_complexity: int,
    graph_degree: int,
    K: int,
    search_complexity: int,
    num_insert_threads: int,
    num_search_threads: int,
    gt_file: str = "",
):
    _dtype, vectors_to_query = _basic_setup(dtype, query_vectors_file)
    vectors_to_index = dap.vectors_from_binary(index_vectors_file, dtype=_dtype)

    npts, ndims = vectors_to_index.shape
    index = dap.DynamicMemoryIndex(
        "l2", _dtype, ndims, npts, build_complexity, graph_degree
    )

    tags = np.arange(1, npts + 1, dtype=np.uintc)  # tags are 1-based; tag i refers to row i - 1
    timer = Timer()

    with timer.time("batch insert"):
        index.batch_insert(vectors_to_index, tags, num_insert_threads)

    # delete a random half of the vectors, by tag
    delete_tags = np.random.choice(
        np.array(range(1, npts + 1, 1), dtype=np.uintc),
        size=int(0.5 * npts),
        replace=False
    )
    with timer.time("mark deletion"):
        for tag in delete_tags:
            index.mark_deleted(tag)

    with timer.time("consolidation"):
        index.consolidate_delete()

    deleted_data = vectors_to_index[delete_tags - 1, :]

    with timer.time("re-insertion"):
        index.batch_insert(deleted_data, delete_tags, num_insert_threads)

    with timer.time("batch search"):
        tags, dists = index.batch_search(vectors_to_query, K, search_complexity, num_search_threads)

    # res_ids = tags - 1
    # if gt_file != "":
    #     recall = utils.calculate_recall_from_gt_file(K, res_ids, gt_file)
    #     print(f"recall@{K} is {recall}")


def static(
    dtype: str,
    index_directory: str,
    index_vectors_file: str,
    query_vectors_file: str,
    build_complexity: int,
    graph_degree: int,
    K: int,
    search_complexity: int,
    num_threads: int,
    gt_file: str = "",
    index_prefix: str = "ann"
):
    _dtype, vectors_to_query = _basic_setup(dtype, query_vectors_file)
    timer = Timer()
    with timer.time("build static index"):
        # build index
        dap.build_memory_index(
            data=index_vectors_file,
            metric="l2",
            vector_dtype=_dtype,
            index_directory=index_directory,
            complexity=build_complexity,
            graph_degree=graph_degree,
            num_threads=num_threads,
            index_prefix=index_prefix,
            alpha=1.2,
            use_pq_build=False,
            num_pq_bytes=8,
            use_opq=False,
        )

    with timer.time("load static index"):
        # ready search object
        index = dap.StaticMemoryIndex(
            metric="l2",
            vector_dtype=_dtype,
            data_path=index_vectors_file,
            index_directory=index_directory,
            num_threads=num_threads,  # this can be different at search time if you would like
            initial_search_complexity=search_complexity,
            index_prefix=index_prefix
        )

    ids, dists = index.batch_search(vectors_to_query, K, search_complexity, num_threads)

    # if gt_file != "":
    #     recall = utils.calculate_recall_from_gt_file(K, ids, gt_file)
    #     print(f"recall@{K} is {recall}")


def dynamic_clustered():
    pass


def generate_clusters():
    pass


class Timer:
    """Times named operations and reports per-operation and cumulative elapsed time."""

    def __init__(self):
        self._start = -1  # set on first use, so the running total spans every timed operation

    @contextmanager
    def time(self, message: str):
        start = perf_counter()
        if self._start == -1:
            self._start = start
        yield
        now = perf_counter()
        print(f"Operation {message} completed in {(now - start):.3f}s, total: {(now - self._start):.3f}s")


if __name__ == "__main__":
    fire.Fire({
        "in-mem-dynamic": dynamic,
        "in-mem-static": static,
        "in-mem-dynamic-clustered": dynamic_clustered,
        "generate-clusters": generate_clusters
    }, name="cli")
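
The recall check in both commands above is stubbed out (utils.calculate_recall_from_gt_file is commented). For reference, recall@K against a ground-truth id matrix can be computed along these lines; a sketch only, assuming row i of both arrays holds the neighbor ids for query i, ordered best-first:

import numpy as np

def recall_at_k(result_ids: np.ndarray, gt_ids: np.ndarray, k: int) -> float:
    # Count, per query, how many of the true top-k ids the index returned,
    # then average over all queries.
    hits = sum(
        np.intersect1d(res[:k], gt[:k]).size
        for res, gt in zip(result_ids, gt_ids)
    )
    return hits / (result_ids.shape[0] * k)

With the fire mapping above, each function becomes a subcommand, e.g. python python/apps/cli in-mem-dynamic --dtype float ... when run from the repository root (flag names follow the function parameters).
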
12 changes: 8 additions & 4 deletions python/apps/in-mem-dynamic.py
@@ -106,7 +106,11 @@ def insert_and_search(
        args.gt_file,
    )

# An ingest optimized example with SIFT1M
# python3 ~/DiskANN/python/apps/in-mem-dynamic.py -d float \
# -i sift_base.fbin -q sift_query.fbin --gt_file gt100_base \
# -Lb 10 -R 30 -Ls 200
"""
An ingest optimized example with SIFT1M
source venv/bin/activate
python python/apps/in-mem-dynamic.py -d float \
-i "$HOME/data/sift/sift_base.fbin" -q "$HOME/data/sift/sift_query.fbin" --gt_file "$HOME/data/sift/gt100_base" \
-Lb 10 -R 30 -Ls 200
"""

5 changes: 2 additions & 3 deletions python/apps/in-mem-static.py
@@ -37,7 +37,7 @@ def build_and_search(
    if not search_only:
        diskannpy.build_memory_index(
            data=indexdata_file,
            metric=metric,
            distance_metric=metric,
            vector_dtype=dtype,
            index_directory=index_directory,
            complexity=Lb,
@@ -52,9 +52,8 @@

    # ready search object
    index = diskannpy.StaticMemoryIndex(
        metric=metric,
        distance_metric=metric,
        vector_dtype=dtype,
        data_path=indexdata_file,
        index_directory=index_directory,
        num_threads=num_threads,  # this can be different at search time if you would like
        initial_search_complexity=Ls,
2 changes: 2 additions & 0 deletions python/apps/requirements.txt
@@ -0,0 +1,2 @@
diskannpy
fire
17 changes: 14 additions & 3 deletions python/src/__init__.py
@@ -6,8 +6,19 @@
    build_memory_index,
    numpy_to_diskann_file,
)
from ._common import VectorDType
from ._disk_index import DiskIndex
from ._diskannpy import INNER_PRODUCT, L2, Metric, defaults
from ._common import (
    DistanceMetric,
    QueryResponse,
    QueryResponseBatch,
    VectorDType,
    VectorIdentifier,
    VectorIdentifierBatch,
    VectorLike,
    VectorLikeBatch,
    valid_dtype
)
from ._diskannpy import defaults
from ._dynamic_memory_index import DynamicMemoryIndex
from ._files import vectors_from_binary, vector_file_metadata
from ._static_disk_index import StaticDiskIndex
from ._static_memory_index import StaticMemoryIndex
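
The widened _common export list above is the heart of the type enhancements: user code can annotate its own helpers with the library's aliases instead of raw numpy types. A minimal sketch (the choice of VectorLikeBatch as the return alias is an assumption; the valid_dtype and vectors_from_binary usage mirrors _basic_setup in the CLI app above):

import diskannpy as dap

def load_queries(path: str, dtype_name: str) -> dap.VectorLikeBatch:
    # valid_dtype maps a user-supplied dtype to one diskannpy accepts
    # (per its use in the CLI's _basic_setup above).
    dtype = dap.valid_dtype(dtype_name)
    return dap.vectors_from_binary(path, dtype=dtype)
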