Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enhancement: reduced usage of numpy and substituted built-in libraries #8418

Merged
merged 11 commits into from
Oct 18, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict, List, Optional, cast

import numpy as np
from typing import Any, Dict, List, Optional

from haystack.lazy_imports import LazyImport
from haystack.utils.auth import Secret
Expand Down Expand Up @@ -78,5 +76,5 @@ def __init__(
)

def embed(self, data: List[str], **kwargs) -> List[List[float]]:
embeddings = cast(np.ndarray, self.model.encode(data, **kwargs)).tolist()
embeddings = self.model.encode(data, **kwargs).tolist()
return embeddings
3 changes: 0 additions & 3 deletions haystack/testing/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
import os
import random

import numpy as np

from haystack import logging

logger = logging.getLogger(__name__)
Expand All @@ -23,7 +21,6 @@ def set_all_seeds(seed: int, deterministic_cudnn: bool = False) -> None:
:param deterministic_cudnn: Enable for full reproducibility when using CUDA. Caution: might slow down training.
"""
random.seed(seed)
np.random.seed(seed)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since numpy is no longer being used to generate random elements, we can safely remove this.

os.environ["PYTHONHASHSEED"] = str(seed)

try:
Expand Down
12 changes: 8 additions & 4 deletions haystack/utils/expit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
#
# SPDX-License-Identifier: Apache-2.0

import numpy as np
from numpy import exp


def expit(x: float) -> float:
"""Compute logistic sigmoid function. Maps input values to a range between 0 and 1"""
return 1 / (1 + np.exp(-x))
def expit(x) -> float:
"""
Compute logistic sigmoid function. Maps input values to a range between 0 and 1

:param x: input value. Can be a scalar or a numpy array.
"""
return 1 / (1 + exp(-x))
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
enhancements:
- |
Reduced numpy usage to speed up imports.
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import os
from unittest.mock import MagicMock, patch

import random
import pytest
from huggingface_hub.utils import RepositoryNotFoundError
from numpy import array, random

from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
from haystack.dataclasses import Document
Expand All @@ -24,7 +24,7 @@ def mock_check_valid_model():


def mock_embedding_generation(json, **kwargs):
response = str(array([random.rand(384) for i in range(len(json["inputs"]))]).tolist()).encode()
response = str([[random.random() for _ in range(384)] for _ in range(len(json["inputs"]))]).encode()
return response


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import os
from unittest.mock import MagicMock, patch

import random
import pytest
from huggingface_hub.utils import RepositoryNotFoundError
from numpy import array, random

from haystack.components.embedders import HuggingFaceAPITextEmbedder
from haystack.utils.auth import Secret
Expand All @@ -22,7 +22,7 @@ def mock_check_valid_model():


def mock_embedding_generation(json, **kwargs):
response = str(array([random.rand(384) for i in range(len(json["inputs"]))]).tolist()).encode()
response = str([[random.random() for _ in range(384)] for _ in range(len(json["inputs"]))]).encode()
return response


Expand Down
5 changes: 3 additions & 2 deletions test/components/embedders/test_openai_document_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import List
from haystack.utils.auth import Secret

import numpy as np
import random
import pytest

from haystack import Document
Expand All @@ -16,7 +16,8 @@ def mock_openai_response(input: List[str], model: str = "text-embedding-ada-002"
dict_response = {
"object": "list",
"data": [
{"object": "embedding", "index": i, "embedding": np.random.rand(1536).tolist()} for i in range(len(input))
{"object": "embedding", "index": i, "embedding": [random.random() for _ in range(1536)]}
for i in range(len(input))
],
"model": model,
"usage": {"prompt_tokens": 4, "total_tokens": 4},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
from unittest.mock import MagicMock, patch

import numpy as np
import random
import pytest
import torch

Expand Down Expand Up @@ -264,7 +264,9 @@ def test_warmup_doesnt_reload(self, mocked_factory):
def test_run(self):
embedder = SentenceTransformersDocumentEmbedder(model="model")
embedder.embedding_backend = MagicMock()
embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist()
embedder.embedding_backend.embed = lambda x, **kwargs: [
[random.random() for _ in range(16)] for _ in range(len(x))
]

documents = [Document(content=f"document number {i}") for i in range(5)]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from unittest.mock import MagicMock, patch

import torch
import numpy as np
import random
import pytest

from haystack.components.embedders.sentence_transformers_text_embedder import SentenceTransformersTextEmbedder
Expand Down Expand Up @@ -239,7 +239,9 @@ def test_warmup_doesnt_reload(self, mocked_factory):
def test_run(self):
embedder = SentenceTransformersTextEmbedder(model="model")
embedder.embedding_backend = MagicMock()
embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist()
embedder.embedding_backend.embed = lambda x, **kwargs: [
[random.random() for _ in range(16)] for _ in range(len(x))
]

text = "a nice text to embed"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import math
from typing import List

import numpy as np
Copy link
Contributor Author

@ajit97singh ajit97singh Sep 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This particular file was importing numpy, but no usage was found.

import pytest

from haystack import Pipeline
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from typing import Dict, Any

import pytest
import numpy as np

from haystack import Pipeline, DeserializationError
from haystack.document_stores.types import FilterPolicy
Expand Down Expand Up @@ -135,7 +134,7 @@ def test_valid_run(self):

assert "documents" in result
assert len(result["documents"]) == top_k
assert np.array_equal(result["documents"][0].embedding, [1.0, 1.0, 1.0, 1.0])
assert result["documents"][0].embedding == [1.0, 1.0, 1.0, 1.0]

def test_invalid_run_wrong_store_type(self):
SomeOtherDocumentStore = document_store_class("SomeOtherDocumentStore")
Expand Down Expand Up @@ -165,4 +164,4 @@ def test_run_with_pipeline(self):
results_docs = result["retriever"]["documents"]
assert results_docs
assert len(results_docs) == top_k
assert np.array_equal(results_docs[0].embedding, [1.0, 1.0, 1.0, 1.0])
assert results_docs[0].embedding == [1.0, 1.0, 1.0, 1.0]
Loading