
Commit

ci: Fix typos discovered by codespell (#5778)
* Fix typos discovered by codespell

* pylint: max-args = 38
cclauss authored Sep 13, 2023
1 parent 30ca042 commit 6dd52d9
Showing 44 changed files with 82 additions and 66 deletions.
7 changes: 7 additions & 0 deletions .pre-commit-config.yaml
@@ -26,6 +26,13 @@ repos:
hooks:
- id: black-jupyter

- repo: https://github.com/codespell-project/codespell
rev: v2.2.5
hooks:
- id: codespell
additional_dependencies:
- tomli

- repo: https://github.com/rhysd/actionlint
rev: v1.6.25
hooks:
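The codespell hook added above lists tomli as an additional dependency because codespell can read its settings from pyproject.toml, and on Python versions older than 3.11 (no standard-library tomllib) it needs the tomli backport to parse that file. A minimal sketch of that kind of lookup, assuming this repository keeps [tool.codespell] and [tool.pylint] tables in pyproject.toml (the commit message's "max-args = 38" bump would live in the latter):

    # Sketch only: assumes [tool.codespell] / [tool.pylint] tables exist in pyproject.toml.
    try:
        import tomllib  # Python 3.11+
    except ModuleNotFoundError:
        import tomli as tomllib  # backport, the hook's additional dependency

    with open("pyproject.toml", "rb") as f:
        config = tomllib.load(f)

    print(config.get("tool", {}).get("codespell", {}))  # e.g. ignore-words-list, skip
    print(config.get("tool", {}).get("pylint", {}))     # e.g. the max-args setting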
2 changes: 1 addition & 1 deletion docker/Dockerfile.base
@@ -43,5 +43,5 @@ ENV PATH="/opt/venv/bin:$PATH"
RUN python3 -c "from haystack.utils.docker import cache_schema; cache_schema()"

# Haystack Preprocessor uses NLTK punkt model to divide text into a list of sentences.
# We cache these models for seemless user experience.
# We cache these models for seamless user experience.
RUN python3 -c "from haystack.utils.docker import cache_nltk_model; cache_nltk_model()"
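The cache_nltk_model helper is not shown in this diff; assuming it simply pre-downloads the NLTK punkt sentence tokenizer at image build time so the first PreProcessor call does not hit the network, a rough equivalent would be:

    # Assumed behaviour of cache_nltk_model; the real helper may do more.
    import nltk

    nltk.download("punkt")  # stores the model under ~/nltk_data by default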
2 changes: 1 addition & 1 deletion docs/pydoc/config/base-document-classifier.yml
@@ -13,7 +13,7 @@ processors:
- type: crossref
renderer:
type: renderers.ReadmeRenderer
excerpt: Abstract class for the Document Classifer.
excerpt: Abstract class for the Document Classifier.
category_slug: haystack-classes
title: Base Document Classifier API
slug: base-document-classifier-api
2 changes: 1 addition & 1 deletion e2e/document_stores/test_update_embeddings.py
@@ -141,7 +141,7 @@ def test_update_embeddings_table_text_retriever(tmp_path):
assert doc.meta["meta_field"] == "value_table_0"
np.testing.assert_array_almost_equal(documents[0].embedding, documents[1].embedding, decimal=4)

# Check if Documents wih different content (text) get different embedding
# Check if Documents with different content (text) get different embedding
documents = ds.get_all_documents(
filters={"meta_field": ["value_text_1", "value_text_2"]}, return_embedding=True
)
2 changes: 1 addition & 1 deletion haystack/agents/conversational.py
@@ -107,6 +107,6 @@ def __init__(
def add_tool(self, tool: Tool):
if len(self.tm.tools) == 0:
raise AgentError(
"You cannot add tools after initializing the ConversationalAgent without any tools. If you want to add tools, reinitailize the ConversationalAgent and provide `tools`."
"You cannot add tools after initializing the ConversationalAgent without any tools. If you want to add tools, reinitialize the ConversationalAgent and provide `tools`."
)
return super().add_tool(tool)
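The error message above implies the intended usage: pass tools when constructing the agent rather than adding them to a tool-less agent afterwards. A hedged sketch, where the exact PromptNode, Tool, and ConversationalAgent arguments are assumptions rather than something taken from this diff:

    # Sketch only: constructor arguments are assumed, not shown in this diff.
    from haystack.agents import Tool
    from haystack.agents.conversational import ConversationalAgent
    from haystack.nodes import PromptNode

    prompt_node = PromptNode("gpt-3.5-turbo", api_key="...", max_length=256)
    search_tool = Tool(
        name="Search",
        pipeline_or_node=prompt_node,  # in practice, usually a search pipeline
        description="Useful for answering questions about current events",
    )

    # Provide tools up front; add_tool() on an agent created without tools raises AgentError.
    agent = ConversationalAgent(prompt_node, tools=[search_tool])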
4 changes: 2 additions & 2 deletions haystack/document_stores/elasticsearch/es7.py
@@ -70,8 +70,8 @@ def __init__(
:param port: port(s) of elasticsearch nodes
:param username: username (standard authentication via http_auth)
:param password: password (standard authentication via http_auth)
:param api_key_id: ID of the API key (altenative authentication mode to the above http_auth)
:param api_key: Secret value of the API key (altenative authentication mode to the above http_auth)
:param api_key_id: ID of the API key (alternative authentication mode to the above http_auth)
:param api_key: Secret value of the API key (alternative authentication mode to the above http_auth)
:param aws4auth: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package)
:param index: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one.
:param label_index: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one.
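The docstring lists two alternative ways of authenticating. A short sketch using the parameter names shown above, assuming the usual ElasticsearchDocumentStore import path:

    from haystack.document_stores import ElasticsearchDocumentStore

    # Standard authentication via http_auth
    ds = ElasticsearchDocumentStore(host="localhost", port=9200, username="elastic", password="...")

    # Alternative: API-key authentication instead of username/password
    ds = ElasticsearchDocumentStore(host="localhost", port=9200, api_key_id="...", api_key="...")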
4 changes: 2 additions & 2 deletions haystack/document_stores/elasticsearch/es8.py
@@ -79,8 +79,8 @@ def __init__(
:param port: port(s) of elasticsearch nodes
:param username: username (standard authentication via http_auth)
:param password: password (standard authentication via http_auth)
:param api_key_id: ID of the API key (altenative authentication mode to the above http_auth)
:param api_key: Secret value of the API key (altenative authentication mode to the above http_auth)
:param api_key_id: ID of the API key (alternative authentication mode to the above http_auth)
:param api_key: Secret value of the API key (alternative authentication mode to the above http_auth)
:param aws4auth: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package)
:param index: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one.
:param label_index: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one.
8 changes: 4 additions & 4 deletions haystack/document_stores/es_converter.py
@@ -72,8 +72,8 @@ def open_search_index_to_document_store(
:param port: Ports(s) of OpenSearch nodes.
:param username: Username (standard authentication via http_auth).
:param password: Password (standard authentication via http_auth).
:param api_key_id: ID of the API key (altenative authentication mode to the above http_auth).
:param api_key: Secret value of the API key (altenative authentication mode to the above http_auth).
:param api_key_id: ID of the API key (alternative authentication mode to the above http_auth).
:param api_key: Secret value of the API key (alternative authentication mode to the above http_auth).
:param aws4auth: Authentication for usage with AWS OpenSearch
(can be generated with the requests-aws4auth package).
:param scheme: `"https"` or `"http"`, protocol used to connect to your OpenSearch instance.
@@ -171,8 +171,8 @@ def elasticsearch_index_to_document_store(
:param port: Ports(s) of Elasticsearch nodes.
:param username: Username (standard authentication via http_auth).
:param password: Password (standard authentication via http_auth).
:param api_key_id: ID of the API key (altenative authentication mode to the above http_auth).
:param api_key: Secret value of the API key (altenative authentication mode to the above http_auth).
:param api_key_id: ID of the API key (alternative authentication mode to the above http_auth).
:param api_key: Secret value of the API key (alternative authentication mode to the above http_auth).
:param aws4auth: Authentication for usage with AWS Elasticsearch
(can be generated with the requests-aws4auth package).
:param scheme: `"https"` or `"http"`, protocol used to connect to your Elasticsearch instance.
2 changes: 1 addition & 1 deletion haystack/document_stores/memory.py
@@ -73,7 +73,7 @@ def __init__(
exists.
:param use_gpu: Whether to use a GPU or the CPU for calculating embedding similarity.
Falls back to CPU if no GPU is available.
:param scoring_batch_size: Batch size of documents to calculate similarity for. Very small batch sizes are inefficent.
:param scoring_batch_size: Batch size of documents to calculate similarity for. Very small batch sizes are inefficient.
Very large batch sizes can overrun GPU memory. In general you want to make sure
you have at least `embedding_dim`*`scoring_batch_size`*4 bytes available in GPU memory.
Since the data is originally stored in CPU memory there is little risk of overruning memory
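The docstring above gives the rule of thumb embedding_dim * scoring_batch_size * 4 bytes of GPU memory. A quick worked example with illustrative numbers (not defaults taken from this diff):

    # float32 embeddings: embedding_dim * scoring_batch_size * 4 bytes of GPU memory
    embedding_dim = 768
    scoring_batch_size = 500_000
    required_bytes = embedding_dim * scoring_batch_size * 4
    print(f"{required_bytes / 1024**3:.2f} GiB")  # ~1.43 GiB for this batch size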
8 changes: 4 additions & 4 deletions haystack/document_stores/opensearch.py
@@ -82,8 +82,8 @@ def __init__(
:param port: port(s) of OpenSearch nodes
:param username: username (standard authentication via http_auth)
:param password: password (standard authentication via http_auth)
:param api_key_id: ID of the API key (altenative authentication mode to the above http_auth)
:param api_key: Secret value of the API key (altenative authentication mode to the above http_auth)
:param api_key_id: ID of the API key (alternative authentication mode to the above http_auth)
:param api_key: Secret value of the API key (alternative authentication mode to the above http_auth)
:param aws4auth: Authentication for usage with AWS OpenSearch Service (can be generated with the requests-aws4auth package)
:param index: Name of index in OpenSearch to use for storing the documents that we want to search. If not existing yet, we will create one.
:param label_index: Name of index in OpenSearch to use for storing labels. If not existing yet, we will create one.
@@ -1299,7 +1299,7 @@ def _get_raw_similarity_score(self, score):
# adjust scores according to https://opensearch.org/docs/latest/search-plugins/knn/approximate-knn
# and https://opensearch.org/docs/latest/search-plugins/knn/knn-score-script/

# space type is required as criterion as there is no consistent similarity-to-space-type mapping accross knn engines
# space type is required as criterion as there is no consistent similarity-to-space-type mapping across knn engines
if self.space_type == "innerproduct":
if score > 1:
score = score - 1
@@ -1419,7 +1419,7 @@ def _train_ivf_index(

def _recommended_ivf_train_size(self) -> int:
"""
Calculates the minumum recommended number of training samples for IVF training as suggested in FAISS docs.
Calculates the minimum recommended number of training samples for IVF training as suggested in FAISS docs.
https://github.com/facebookresearch/faiss/wiki/FAQ#can-i-ignore-warning-clustering-xxx-points-to-yyy-centroids
"""
min_points_per_cluster = 39
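The FAISS FAQ linked above warns when fewer than roughly 39 training points per centroid are provided, so the recommended training-set size presumably scales with the number of IVF clusters. A hedged sketch of that calculation (the actual method body is cut off in this diff):

    # Assumed reading of the FAISS guideline: at least 39 training points per centroid.
    def recommended_ivf_train_size(n_clusters: int, min_points_per_cluster: int = 39) -> int:
        return n_clusters * min_points_per_cluster

    print(recommended_ivf_train_size(256))  # 9984 training samples for 256 clusters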
2 changes: 1 addition & 1 deletion haystack/document_stores/search_engine.py
@@ -206,7 +206,7 @@ def _bulk(
except Exception as e:
if hasattr(e, "status_code") and e.status_code == 429: # type: ignore
logger.warning(
"Failed to insert a batch of '%s' documents because of a 'Too Many Requeset' response. "
"Failed to insert a batch of '%s' documents because of a 'Too Many Requests' response. "
"Splitting the number of documents into two chunks with the same size and retrying in %s seconds.",
len(documents),
_timeout,
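The warning above describes a halve-and-retry strategy for 429 (Too Many Requests) responses. A simplified, standalone sketch of that pattern, not the actual _bulk implementation:

    import time

    def bulk_with_backoff(documents, insert_batch, timeout=1.0):
        """Try to insert a batch; on HTTP 429, wait, split it in half and retry each half."""
        try:
            insert_batch(documents)
        except Exception as e:
            if getattr(e, "status_code", None) == 429 and len(documents) > 1:
                time.sleep(timeout)
                half = len(documents) // 2
                bulk_with_backoff(documents[:half], insert_batch, timeout * 2)
                bulk_with_backoff(documents[half:], insert_batch, timeout * 2)
            else:
                raise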
2 changes: 1 addition & 1 deletion haystack/modeling/data_handler/data_silo.py
@@ -110,7 +110,7 @@ def _get_dataset(self, filename: Optional[Union[str, Path]], dicts: Optional[Lis
# loading dicts from file (default)
if dicts is None:
dicts = list(self.processor.file_to_dicts(filename)) # type: ignore
# shuffle list of dicts here if we later want to have a random dev set splitted from train set
# shuffle list of dicts here if we later want to have a random dev set split from train set
if str(self.processor.train_filename) in str(filename):
if not self.processor.dev_filename:
if self.processor.dev_split > 0.0:
12 changes: 6 additions & 6 deletions haystack/modeling/data_handler/processor.py
@@ -387,7 +387,7 @@ def __init__(
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:param data_dir: The directory in which the train and dev files can be found.
If not available the dataset will be loaded automaticaly
If not available the dataset will be loaded automatically
if the last directory has the same name as a predefined dataset.
These predefined datasets are defined as the keys in the dict at
`haystack.basics.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/haystack/blob/main/haystack/basics/data_handler/utils.py>`_.
@@ -575,7 +575,7 @@ def _split_docs_into_passages(self, baskets: List[SampleBasket]):
)
except Exception as e:
logger.warning(
"Could not devide document into passages. Document: %s\nWith error: %s",
"Could not divide document into passages. Document: %s\nWith error: %s",
basket.raw["document_text"][:200],
e,
)
@@ -880,7 +880,7 @@ def __init__(
:param max_seq_len_query: Query samples are truncated after this many tokens.
:param max_seq_len_passage: Context/Passage Samples are truncated after this many tokens.
:param data_dir: The directory in which the train and dev files can be found.
If not available the dataset will be loaded automaticaly
If not available the dataset will be loaded automatically
if the last directory has the same name as a predefined dataset.
These predefined datasets are defined as the keys in the dict at
`haystack.basics.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/haystack/blob/main/haystack/basics/data_handler/utils.py>`_.
@@ -1038,7 +1038,7 @@ def dataset_from_dicts(
# Take the dict and insert into our basket structure, this stages also adds an internal IDs
baskets = self._fill_baskets(dicts, indices)

# Separat conversion of query
# Separate conversion of query
baskets = self._convert_queries(baskets=baskets)

# and context passages. When converting the context the label is also assigned.
@@ -1799,7 +1799,7 @@ def __init__(
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
:param data_dir: The directory in which the train and dev files can be found.
If not available the dataset will be loaded automaticaly
If not available the dataset will be loaded automatically
if the last directory has the same name as a predefined dataset.
These predefined datasets are defined as the keys in the dict at
`farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/main/farm/data_handler/utils.py>`_.
@@ -2151,7 +2151,7 @@ def write_squad_predictions(predictions, out_filename, predictions_filename=None
dev_labels[q["id"]] = q["answers"][0]["text"]
not_included = set(list(dev_labels.keys())) - set(list(predictions_json.keys()))
if len(not_included) > 0:
logger.info("There were missing predicitons for question ids: %s", list(not_included))
logger.info("There were missing predictions for question ids: %s", list(not_included))
for x in not_included:
predictions_json[x] = ""

2 changes: 1 addition & 1 deletion haystack/modeling/data_handler/samples.py
@@ -187,7 +187,7 @@ def offset_to_token_idx_vecorized(token_offsets, ch_idx):
if ch_idx >= np.max(token_offsets):
# idx must be including
idx = np.argmax(token_offsets)
# looking for the first occurence of token_offsets larger than ch_idx and taking one position to the left.
# looking for the first occurrence of token_offsets larger than ch_idx and taking one position to the left.
# This is needed to overcome n special_tokens at start of sequence
# and failsafe matching (the character start might not always coincide with a token offset, e.g. when starting at whitespace)
else:
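The comment above describes the branch that is cut off here: find the first token offset strictly greater than ch_idx, then step one position back. In NumPy that is typically a single argmax over a boolean mask; a small illustration, not the full function:

    import numpy as np

    token_offsets = np.array([0, 5, 11, 18])  # start character of each token
    ch_idx = 13                               # a character inside the third token

    # First offset strictly greater than ch_idx, then one position to the left.
    idx = int(np.argmax(token_offsets > ch_idx)) - 1
    print(idx)  # 2 -> the token starting at character 11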
2 changes: 1 addition & 1 deletion haystack/modeling/infer.py
@@ -304,7 +304,7 @@ def inference_from_dicts(
:return: list of predictions
"""
# whether to aggregate predictions across different samples (e.g. for QA on long texts)
# TODO remove or adjust after implmenting input objects properly
# TODO remove or adjust after implementing input objects properly
# if set(dicts[0].keys()) == {"qas", "context"}:
# warnings.warn("QA Input dictionaries with [qas, context] as keys will be deprecated in the future",
# DeprecationWarning)
2 changes: 1 addition & 1 deletion haystack/modeling/model/adaptive_model.py
@@ -568,7 +568,7 @@ def forward_lm(self, **kwargs):

def log_params(self):
"""
Logs parameteres to generic logger MlLogger
Logs parameters to generic logger MlLogger
"""
params = {
"lm_type": self.language_model.__class__.__name__,
2 changes: 1 addition & 1 deletion haystack/modeling/model/biadaptive_model.py
@@ -367,7 +367,7 @@ def forward_lm(

def log_params(self):
"""
Logs paramteres to generic logger MlLogger
Logs parameters to generic logger MlLogger
"""
params = {
"lm1_type": self.language_model1.__class__.__name__,
2 changes: 1 addition & 1 deletion haystack/modeling/model/feature_extraction.py
@@ -382,7 +382,7 @@ def _words_to_tokens(
first_token = True
for token in tokens_word:
token_offsets.append(word_offset)
# Depending on the tokenizer type special chars are added to distinguish tokens with preceeding
# Depending on the tokenizer type special chars are added to distinguish tokens with preceding
# whitespace (=> "start of a word"). We need to get rid of these to calculate the original length of the token
original_token = re.sub(SPECIAL_TOKENIZER_CHARS, "", token)
# Don't use length of unk token for offset calculation
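For context on the comment above: subword tokenizers often mark a token that starts a new word with a special prefix character, for example "Ġ" in GPT-2-style BPE or "▁" in SentencePiece. Stripping those markers before measuring the token recovers its original surface length. A small illustration; the actual SPECIAL_TOKENIZER_CHARS pattern in this file is not shown in the diff and may differ:

    import re

    # Illustrative pattern only; the real SPECIAL_TOKENIZER_CHARS may differ.
    MARKERS = re.compile(r"^[\u0120\u2581]")  # 'Ġ' (BPE) or '▁' (SentencePiece)

    for token in ["Ġworld", "▁hello"]:
        original = MARKERS.sub("", token)
        print(token, "->", original, len(original))  # both recover length 5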
2 changes: 1 addition & 1 deletion haystack/modeling/model/language_model.py
@@ -153,7 +153,7 @@ def formatted_preds(
"""
Extracting vectors from a language model (for example, for extracting sentence embeddings).
You can use different pooling strategies and layers by specifying them in the object attributes
`extraction_layer` and `extraction_strategy`. You should set both these attirbutes using the Inferencer:
`extraction_layer` and `extraction_strategy`. You should set both these attributes using the Inferencer:
Example: Inferencer(extraction_strategy='cls_token', extraction_layer=-1)
:param logits: Tuple of (sequence_output, pooled_output) from the language model.
2 changes: 1 addition & 1 deletion haystack/modeling/model/optimization.py
@@ -142,7 +142,7 @@ def initialize_optimizer(
# Adjust for parallel training
model, optimizer = optimize_model(model, device, local_rank, optimizer, distributed)

# Get learning rate schedule - moved below to supress warning
# Get learning rate schedule - moved below to suppress warning
scheduler = get_scheduler(optimizer, schedule_opts)

return model, optimizer, scheduler
4 changes: 2 additions & 2 deletions haystack/modeling/model/prediction_head.py
@@ -864,7 +864,7 @@ def pred_to_doc_idxs(pred, passage_start_t):
"""
Converts the passage level predictions to document level predictions. Note that on the doc level we
don't have special tokens or question tokens. This means that a no answer
cannot be prepresented by a (0,0) qa_answer but will instead be represented by (-1, -1)
cannot be represented by a (0,0) qa_answer but will instead be represented by (-1, -1)
"""
new_pred = []
for qa_answer in pred:
@@ -891,7 +891,7 @@ def label_to_doc_idxs(label, passage_start_t):
"""
Converts the passage level labels to document level labels. Note that on the doc level we
don't have special tokens or question tokens. This means that a no answer
cannot be prepresented by a (0,0) span but will instead be represented by (-1, -1)
cannot be represented by a (0,0) span but will instead be represented by (-1, -1)
"""
new_label = []
for start, end in label:
2 changes: 1 addition & 1 deletion haystack/modeling/model/triadaptive_model.py
@@ -391,7 +391,7 @@ def forward_lm(self, **kwargs):

def log_params(self):
"""
Logs paramteres to generic logger MlLogger
Logs parameters to generic logger MlLogger
"""
params = {
"lm1_type": self.language_model1.__class__.__name__,
2 changes: 1 addition & 1 deletion haystack/nodes/_json_schema.py
@@ -205,7 +205,7 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[
param_fields.pop(0)
param_fields_kwargs: Dict[str, Any] = {}

# Read all the paramteres extracted from the __init__ method with type and default value
# Read all the parameters extracted from the __init__ method with type and default value
for param in param_fields:
annotation = Any
if param.annotation != param.empty:
2 changes: 1 addition & 1 deletion haystack/nodes/audio/whisper_transcriber.py
@@ -52,7 +52,7 @@ def __init__(
:param api_key: OpenAI API key. If None, a local installation of Whisper is used.
:param model_name_or_path: Name of the model to use. If using a local installation of Whisper, set this to one of the following values: "tiny", "small", "medium", "large", "large-v2". If using
the API, set thsi value to: "whisper-1" (default).
the API, set this value to: "whisper-1" (default).
:param device: Device to use for inference. Only used if you're using a local
installation of Whisper. If None, the device is automatically selected.
:param api_base: The OpenAI API Base url, defaults to `https://api.openai.com/v1`.
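A short usage sketch based only on the parameters listed above; the import path and anything beyond those parameters are assumptions, and local use additionally requires a Whisper installation:

    # Sketch only: assumes WhisperTranscriber is exported from haystack.nodes.
    from haystack.nodes import WhisperTranscriber

    # Local inference: no api_key, pick one of the local model sizes.
    local_whisper = WhisperTranscriber(api_key=None, model_name_or_path="small")

    # Hosted API: provide an OpenAI key and use the API model name.
    api_whisper = WhisperTranscriber(api_key="sk-...", model_name_or_path="whisper-1")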