From 6dd52d91b2c0d8db9682b8d2901be99b098c8d42 Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Wed, 13 Sep 2023 16:14:45 +0200
Subject: [PATCH] ci: Fix typos discovered by codespell (#5778)

* Fix typos discovered by codespell

* pylint: max-args = 38
---
 .pre-commit-config.yaml                        |  7 +++++++
 docker/Dockerfile.base                         |  2 +-
 docs/pydoc/config/base-document-classifier.yml |  2 +-
 e2e/document_stores/test_update_embeddings.py  |  2 +-
 haystack/agents/conversational.py              |  2 +-
 haystack/document_stores/elasticsearch/es7.py  |  4 ++--
 haystack/document_stores/elasticsearch/es8.py  |  4 ++--
 haystack/document_stores/es_converter.py       |  8 ++++----
 haystack/document_stores/memory.py             |  2 +-
 haystack/document_stores/opensearch.py         |  8 ++++----
 haystack/document_stores/search_engine.py      |  2 +-
 haystack/modeling/data_handler/data_silo.py    |  2 +-
 haystack/modeling/data_handler/processor.py    | 12 ++++++------
 haystack/modeling/data_handler/samples.py      |  2 +-
 haystack/modeling/infer.py                     |  2 +-
 haystack/modeling/model/adaptive_model.py      |  2 +-
 haystack/modeling/model/biadaptive_model.py    |  2 +-
 haystack/modeling/model/feature_extraction.py  |  2 +-
 haystack/modeling/model/language_model.py      |  2 +-
 haystack/modeling/model/optimization.py        |  2 +-
 haystack/modeling/model/prediction_head.py     |  4 ++--
 haystack/modeling/model/triadaptive_model.py   |  2 +-
 haystack/nodes/_json_schema.py                 |  2 +-
 haystack/nodes/audio/whisper_transcriber.py    |  2 +-
 haystack/nodes/base.py                         |  2 +-
 haystack/nodes/connector/crawler.py            |  2 +-
 haystack/nodes/image_to_text/transformers.py   |  2 +-
 .../prompt/invocation_layer/hugging_face.py    |  2 +-
 .../invocation_layer/sagemaker_hf_text_gen.py  |  2 +-
 haystack/nodes/prompt/prompt_template.py       |  2 +-
 haystack/nodes/reader/farm.py                  |  2 +-
 haystack/utils/deepsetcloud.py                 | 16 ++++++++--------
 haystack/utils/squad_data.py                   |  2 +-
 proposals/text/3875-table-cell.md              |  2 +-
 proposals/text/4284-drop-basecomponent.md      |  6 +++---
 .../text/4370-documentstores-and-retrievers.md |  2 +-
 proposals/text/5390-embedders.md               |  2 +-
 pyproject.toml                                 |  7 ++++++-
 .../notes/codespell-d4a32b9c589ca26e.yaml      |  4 ++++
 rest_api/rest_api/controller/feedback.py       |  2 +-
 test/conftest.py                               |  2 +-
 test/document_stores/test_opensearch.py        |  4 ++--
 test/prompt/test_prompt_node.py                |  2 +-
 test/samples/markdown/sample.md                |  2 +-
 44 files changed, 82 insertions(+), 66 deletions(-)
 create mode 100644 releasenotes/notes/codespell-d4a32b9c589ca26e.yaml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 228f5ec094..894ed40ace 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -26,6 +26,13 @@ repos:
   hooks:
   - id: black-jupyter

+- repo: https://github.com/codespell-project/codespell
+  rev: v2.2.5
+  hooks:
+  - id: codespell
+    additional_dependencies:
+      - tomli
+
 - repo: https://github.com/rhysd/actionlint
   rev: v1.6.25
   hooks:
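A minimal sketch of how the new hook can be exercised locally, assuming `pre-commit` is installed in the environment (the commands are illustrative, not part of the patch):

```shell
# One-time setup: install the git hooks defined in .pre-commit-config.yaml
pre-commit install

# Run only the new codespell hook against every file in the repository
pre-commit run codespell --all-files
```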
diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base
index d58bded526..6ad4378c30 100644
--- a/docker/Dockerfile.base
+++ b/docker/Dockerfile.base
@@ -43,5 +43,5 @@ ENV PATH="/opt/venv/bin:$PATH"
 RUN python3 -c "from haystack.utils.docker import cache_schema; cache_schema()"

 # Haystack Preprocessor uses NLTK punkt model to divide text into a list of sentences.
-# We cache these models for seemless user experience.
+# We cache these models for a seamless user experience.
 RUN python3 -c "from haystack.utils.docker import cache_nltk_model; cache_nltk_model()"
diff --git a/docs/pydoc/config/base-document-classifier.yml b/docs/pydoc/config/base-document-classifier.yml
index 1e8fd809b2..5c3726ccf5 100644
--- a/docs/pydoc/config/base-document-classifier.yml
+++ b/docs/pydoc/config/base-document-classifier.yml
@@ -13,7 +13,7 @@ processors:
   - type: crossref
 renderer:
   type: renderers.ReadmeRenderer
-  excerpt: Abstract class for the Document Classifer.
+  excerpt: Abstract class for the Document Classifier.
   category_slug: haystack-classes
   title: Base Document Classifier API
   slug: base-document-classifier-api
diff --git a/e2e/document_stores/test_update_embeddings.py b/e2e/document_stores/test_update_embeddings.py
index 61a492c124..0666bd432a 100644
--- a/e2e/document_stores/test_update_embeddings.py
+++ b/e2e/document_stores/test_update_embeddings.py
@@ -141,7 +141,7 @@ def test_update_embeddings_table_text_retriever(tmp_path):
         assert doc.meta["meta_field"] == "value_table_0"
     np.testing.assert_array_almost_equal(documents[0].embedding, documents[1].embedding, decimal=4)

-    # Check if Documents wih different content (text) get different embedding
+    # Check if Documents with different content (text) get different embedding
     documents = ds.get_all_documents(
         filters={"meta_field": ["value_text_1", "value_text_2"]}, return_embedding=True
     )
diff --git a/haystack/agents/conversational.py b/haystack/agents/conversational.py
index 5701661be6..fce1f41647 100644
--- a/haystack/agents/conversational.py
+++ b/haystack/agents/conversational.py
@@ -107,6 +107,6 @@ def __init__(
     def add_tool(self, tool: Tool):
         if len(self.tm.tools) == 0:
             raise AgentError(
-                "You cannot add tools after initializing the ConversationalAgent without any tools. If you want to add tools, reinitailize the ConversationalAgent and provide `tools`."
+                "You cannot add tools after initializing the ConversationalAgent without any tools. If you want to add tools, reinitialize the ConversationalAgent and provide `tools`."
             )
         return super().add_tool(tool)
diff --git a/haystack/document_stores/elasticsearch/es7.py b/haystack/document_stores/elasticsearch/es7.py
index 970a0abc16..730f5bce22 100644
--- a/haystack/document_stores/elasticsearch/es7.py
+++ b/haystack/document_stores/elasticsearch/es7.py
@@ -70,8 +70,8 @@ def __init__(
         :param port: port(s) of elasticsearch nodes
         :param username: username (standard authentication via http_auth)
         :param password: password (standard authentication via http_auth)
-        :param api_key_id: ID of the API key (altenative authentication mode to the above http_auth)
-        :param api_key: Secret value of the API key (altenative authentication mode to the above http_auth)
+        :param api_key_id: ID of the API key (alternative authentication mode to the above http_auth)
+        :param api_key: Secret value of the API key (alternative authentication mode to the above http_auth)
         :param aws4auth: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package)
         :param index: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one.
         :param label_index: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one.
diff --git a/haystack/document_stores/elasticsearch/es8.py b/haystack/document_stores/elasticsearch/es8.py
index 47c96ab170..4454072137 100644
--- a/haystack/document_stores/elasticsearch/es8.py
+++ b/haystack/document_stores/elasticsearch/es8.py
@@ -79,8 +79,8 @@ def __init__(
         :param port: port(s) of elasticsearch nodes
         :param username: username (standard authentication via http_auth)
         :param password: password (standard authentication via http_auth)
-        :param api_key_id: ID of the API key (altenative authentication mode to the above http_auth)
-        :param api_key: Secret value of the API key (altenative authentication mode to the above http_auth)
+        :param api_key_id: ID of the API key (alternative authentication mode to the above http_auth)
+        :param api_key: Secret value of the API key (alternative authentication mode to the above http_auth)
         :param aws4auth: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package)
         :param index: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one.
         :param label_index: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one.
diff --git a/haystack/document_stores/es_converter.py b/haystack/document_stores/es_converter.py
index 87a8e65a7e..1d242c92fa 100644
--- a/haystack/document_stores/es_converter.py
+++ b/haystack/document_stores/es_converter.py
@@ -72,8 +72,8 @@ def open_search_index_to_document_store(
     :param port: Ports(s) of OpenSearch nodes.
     :param username: Username (standard authentication via http_auth).
     :param password: Password (standard authentication via http_auth).
-    :param api_key_id: ID of the API key (altenative authentication mode to the above http_auth).
-    :param api_key: Secret value of the API key (altenative authentication mode to the above http_auth).
+    :param api_key_id: ID of the API key (alternative authentication mode to the above http_auth).
+    :param api_key: Secret value of the API key (alternative authentication mode to the above http_auth).
     :param aws4auth: Authentication for usage with AWS OpenSearch (can be generated with the
                      requests-aws4auth package).
     :param scheme: `"https"` or `"http"`, protocol used to connect to your OpenSearch instance.
@@ -171,8 +171,8 @@ def elasticsearch_index_to_document_store(
     :param port: Ports(s) of Elasticsearch nodes.
     :param username: Username (standard authentication via http_auth).
     :param password: Password (standard authentication via http_auth).
-    :param api_key_id: ID of the API key (altenative authentication mode to the above http_auth).
-    :param api_key: Secret value of the API key (altenative authentication mode to the above http_auth).
+    :param api_key_id: ID of the API key (alternative authentication mode to the above http_auth).
+    :param api_key: Secret value of the API key (alternative authentication mode to the above http_auth).
     :param aws4auth: Authentication for usage with AWS Elasticsearch (can be generated with the
                      requests-aws4auth package).
     :param scheme: `"https"` or `"http"`, protocol used to connect to your Elasticsearch instance.
diff --git a/haystack/document_stores/memory.py b/haystack/document_stores/memory.py
index 563c6db87c..56cc40e692 100644
--- a/haystack/document_stores/memory.py
+++ b/haystack/document_stores/memory.py
@@ -73,7 +73,7 @@ def __init__(
                      exists.
        :param use_gpu: Whether to use a GPU or the CPU for calculating embedding similarity.
                        Falls back to CPU if no GPU is available.
-        :param scoring_batch_size: Batch size of documents to calculate similarity for. Very small batch sizes are inefficent.
+        :param scoring_batch_size: Batch size of documents to calculate similarity for. Very small batch sizes are inefficient.
                                    Very large batch sizes can overrun GPU memory. In general you want to make sure
                                    you have at least `embedding_dim`*`scoring_batch_size`*4 bytes available in GPU memory.
                                    Since the data is originally stored in CPU memory there is little risk of overruning memory
diff --git a/haystack/document_stores/opensearch.py b/haystack/document_stores/opensearch.py
index c6a683fb46..0b05bbb8be 100644
--- a/haystack/document_stores/opensearch.py
+++ b/haystack/document_stores/opensearch.py
@@ -82,8 +82,8 @@ def __init__(
         :param port: port(s) of OpenSearch nodes
         :param username: username (standard authentication via http_auth)
         :param password: password (standard authentication via http_auth)
-        :param api_key_id: ID of the API key (altenative authentication mode to the above http_auth)
-        :param api_key: Secret value of the API key (altenative authentication mode to the above http_auth)
+        :param api_key_id: ID of the API key (alternative authentication mode to the above http_auth)
+        :param api_key: Secret value of the API key (alternative authentication mode to the above http_auth)
         :param aws4auth: Authentication for usage with AWS OpenSearch Service (can be generated with the requests-aws4auth package)
         :param index: Name of index in OpenSearch to use for storing the documents that we want to search. If not existing yet, we will create one.
         :param label_index: Name of index in OpenSearch to use for storing labels. If not existing yet, we will create one.
@@ -1299,7 +1299,7 @@ def _get_raw_similarity_score(self, score):

         # adjust scores according to https://opensearch.org/docs/latest/search-plugins/knn/approximate-knn
         # and https://opensearch.org/docs/latest/search-plugins/knn/knn-score-script/
-        # space type is required as criterion as there is no consistent similarity-to-space-type mapping accross knn engines
+        # space type is required as criterion as there is no consistent similarity-to-space-type mapping across knn engines
         if self.space_type == "innerproduct":
             if score > 1:
                 score = score - 1
@@ -1419,7 +1419,7 @@ def _train_ivf_index(

     def _recommended_ivf_train_size(self) -> int:
         """
-        Calculates the minumum recommended number of training samples for IVF training as suggested in FAISS docs.
+        Calculates the minimum recommended number of training samples for IVF training as suggested in FAISS docs.
        https://github.com/facebookresearch/faiss/wiki/FAQ#can-i-ignore-warning-clustering-xxx-points-to-yyy-centroids
         """
         min_points_per_cluster = 39
diff --git a/haystack/document_stores/search_engine.py b/haystack/document_stores/search_engine.py
index 4821cb07a9..9b212d82ee 100644
--- a/haystack/document_stores/search_engine.py
+++ b/haystack/document_stores/search_engine.py
@@ -206,7 +206,7 @@ def _bulk(
         except Exception as e:
             if hasattr(e, "status_code") and e.status_code == 429:  # type: ignore
                 logger.warning(
-                    "Failed to insert a batch of '%s' documents because of a 'Too Many Requeset' response. "
" "Splitting the number of documents into two chunks with the same size and retrying in %s seconds.", len(documents), _timeout, diff --git a/haystack/modeling/data_handler/data_silo.py b/haystack/modeling/data_handler/data_silo.py index c9db096ade..bb9c273512 100644 --- a/haystack/modeling/data_handler/data_silo.py +++ b/haystack/modeling/data_handler/data_silo.py @@ -110,7 +110,7 @@ def _get_dataset(self, filename: Optional[Union[str, Path]], dicts: Optional[Lis # loading dicts from file (default) if dicts is None: dicts = list(self.processor.file_to_dicts(filename)) # type: ignore - # shuffle list of dicts here if we later want to have a random dev set splitted from train set + # shuffle list of dicts here if we later want to have a random dev set split from train set if str(self.processor.train_filename) in str(filename): if not self.processor.dev_filename: if self.processor.dev_split > 0.0: diff --git a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py index 710a690466..6d5cda47fb 100644 --- a/haystack/modeling/data_handler/processor.py +++ b/haystack/modeling/data_handler/processor.py @@ -387,7 +387,7 @@ def __init__( :param tokenizer: Used to split a sentence (str) into tokens. :param max_seq_len: Samples are truncated after this many tokens. :param data_dir: The directory in which the train and dev files can be found. - If not available the dataset will be loaded automaticaly + If not available the dataset will be loaded automatically if the last directory has the same name as a predefined dataset. These predefined datasets are defined as the keys in the dict at `haystack.basics.data_handler.utils.DOWNSTREAM_TASK_MAP `_. @@ -575,7 +575,7 @@ def _split_docs_into_passages(self, baskets: List[SampleBasket]): ) except Exception as e: logger.warning( - "Could not devide document into passages. Document: %s\nWith error: %s", + "Could not divide document into passages. Document: %s\nWith error: %s", basket.raw["document_text"][:200], e, ) @@ -880,7 +880,7 @@ def __init__( :param max_seq_len_query: Query samples are truncated after this many tokens. :param max_seq_len_passage: Context/Passage Samples are truncated after this many tokens. :param data_dir: The directory in which the train and dev files can be found. - If not available the dataset will be loaded automaticaly + If not available the dataset will be loaded automatically if the last directory has the same name as a predefined dataset. These predefined datasets are defined as the keys in the dict at `haystack.basics.data_handler.utils.DOWNSTREAM_TASK_MAP `_. @@ -1038,7 +1038,7 @@ def dataset_from_dicts( # Take the dict and insert into our basket structure, this stages also adds an internal IDs baskets = self._fill_baskets(dicts, indices) - # Separat conversion of query + # Separate conversion of query baskets = self._convert_queries(baskets=baskets) # and context passages. When converting the context the label is also assigned. @@ -1799,7 +1799,7 @@ def __init__( :param max_seq_len: Samples are truncated after this many tokens. :type max_seq_len: int :param data_dir: The directory in which the train and dev files can be found. - If not available the dataset will be loaded automaticaly + If not available the dataset will be loaded automatically if the last directory has the same name as a predefined dataset. These predefined datasets are defined as the keys in the dict at `farm.data_handler.utils.DOWNSTREAM_TASK_MAP `_. 
@@ -2151,7 +2151,7 @@ def write_squad_predictions(predictions, out_filename, predictions_filename=None
             dev_labels[q["id"]] = q["answers"][0]["text"]
     not_included = set(list(dev_labels.keys())) - set(list(predictions_json.keys()))
     if len(not_included) > 0:
-        logger.info("There were missing predicitons for question ids: %s", list(not_included))
+        logger.info("There were missing predictions for question ids: %s", list(not_included))
     for x in not_included:
         predictions_json[x] = ""

diff --git a/haystack/modeling/data_handler/samples.py b/haystack/modeling/data_handler/samples.py
index ebc3d5d2f5..0fd7402539 100644
--- a/haystack/modeling/data_handler/samples.py
+++ b/haystack/modeling/data_handler/samples.py
@@ -187,7 +187,7 @@ def offset_to_token_idx_vecorized(token_offsets, ch_idx):
     if ch_idx >= np.max(token_offsets):
         # idx must be including
         idx = np.argmax(token_offsets)
-    # looking for the first occurence of token_offsets larger than ch_idx and taking one position to the left.
+    # looking for the first occurrence of token_offsets larger than ch_idx and taking one position to the left.
     # This is needed to overcome n special_tokens at start of sequence
     # and failsafe matching (the character start might not always coincide with a token offset, e.g. when starting at whitespace)
     else:
diff --git a/haystack/modeling/infer.py b/haystack/modeling/infer.py
index fe53a50b66..81b159c80e 100644
--- a/haystack/modeling/infer.py
+++ b/haystack/modeling/infer.py
@@ -304,7 +304,7 @@ def inference_from_dicts(
         :return: list of predictions
         """
         # whether to aggregate predictions across different samples (e.g. for QA on long texts)
-        # TODO remove or adjust after implmenting input objects properly
+        # TODO remove or adjust after implementing input objects properly
         # if set(dicts[0].keys()) == {"qas", "context"}:
         #     warnings.warn("QA Input dictionaries with [qas, context] as keys will be deprecated in the future",
         #                   DeprecationWarning)
diff --git a/haystack/modeling/model/adaptive_model.py b/haystack/modeling/model/adaptive_model.py
index b988d98651..eb17d80e4c 100644
--- a/haystack/modeling/model/adaptive_model.py
+++ b/haystack/modeling/model/adaptive_model.py
@@ -568,7 +568,7 @@ def forward_lm(self, **kwargs):

     def log_params(self):
         """
-        Logs parameteres to generic logger MlLogger
+        Logs parameters to generic logger MlLogger
         """
         params = {
             "lm_type": self.language_model.__class__.__name__,
diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py
index 1def4ff4a0..0ebcf072e0 100644
--- a/haystack/modeling/model/biadaptive_model.py
+++ b/haystack/modeling/model/biadaptive_model.py
@@ -367,7 +367,7 @@ def forward_lm(

     def log_params(self):
         """
-        Logs paramteres to generic logger MlLogger
+        Logs parameters to generic logger MlLogger
        """
         params = {
             "lm1_type": self.language_model1.__class__.__name__,
diff --git a/haystack/modeling/model/feature_extraction.py b/haystack/modeling/model/feature_extraction.py
index 2ce31175ac..6b7508a84a 100644
--- a/haystack/modeling/model/feature_extraction.py
+++ b/haystack/modeling/model/feature_extraction.py
@@ -382,7 +382,7 @@ def _words_to_tokens(
         first_token = True
         for token in tokens_word:
             token_offsets.append(word_offset)
-            # Depending on the tokenizer type special chars are added to distinguish tokens with preceeding
+            # Depending on the tokenizer type special chars are added to distinguish tokens with preceding
             # whitespace (=> "start of a word"). We need to get rid of these to calculate the original length of the token
             original_token = re.sub(SPECIAL_TOKENIZER_CHARS, "", token)
             # Don't use length of unk token for offset calculation
diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py
index 82f3186fe2..eaa83febd7 100644
--- a/haystack/modeling/model/language_model.py
+++ b/haystack/modeling/model/language_model.py
@@ -153,7 +153,7 @@ def formatted_preds(
        """
         Extracting vectors from a language model (for example, for extracting sentence embeddings).
         You can use different pooling strategies and layers by specifying them in the object attributes
-        `extraction_layer` and `extraction_strategy`. You should set both these attirbutes using the Inferencer:
+        `extraction_layer` and `extraction_strategy`. You should set both these attributes using the Inferencer:
         Example:  Inferencer(extraction_strategy='cls_token', extraction_layer=-1)

         :param logits: Tuple of (sequence_output, pooled_output) from the language model.
diff --git a/haystack/modeling/model/optimization.py b/haystack/modeling/model/optimization.py
index 69d1a235a3..d3e55f7674 100644
--- a/haystack/modeling/model/optimization.py
+++ b/haystack/modeling/model/optimization.py
@@ -142,7 +142,7 @@ def initialize_optimizer(
     # Adjust for parallel training
     model, optimizer = optimize_model(model, device, local_rank, optimizer, distributed)

-    # Get learning rate schedule - moved below to supress warning
+    # Get learning rate schedule - moved below to suppress warning
     scheduler = get_scheduler(optimizer, schedule_opts)

     return model, optimizer, scheduler
diff --git a/haystack/modeling/model/prediction_head.py b/haystack/modeling/model/prediction_head.py
index d433c34b86..351e4045d0 100644
--- a/haystack/modeling/model/prediction_head.py
+++ b/haystack/modeling/model/prediction_head.py
@@ -864,7 +864,7 @@ def pred_to_doc_idxs(pred, passage_start_t):
         """
         Converts the passage level predictions to document level predictions.
         Note that on the doc level we don't have special tokens or question tokens. This means that a no answer
-        cannot be prepresented by a (0,0) qa_answer but will instead be represented by (-1, -1)
+        cannot be represented by a (0,0) qa_answer but will instead be represented by (-1, -1)
         """
         new_pred = []
         for qa_answer in pred:
@@ -891,7 +891,7 @@ def label_to_doc_idxs(label, passage_start_t):
         """
         Converts the passage level labels to document level labels.
         Note that on the doc level we don't have special tokens or question tokens. This means that a no answer
-        cannot be prepresented by a (0,0) span but will instead be represented by (-1, -1)
+        cannot be represented by a (0,0) span but will instead be represented by (-1, -1)
         """
         new_label = []
         for start, end in label:
diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py
index 20f939257e..4e92b8ed90 100644
--- a/haystack/modeling/model/triadaptive_model.py
+++ b/haystack/modeling/model/triadaptive_model.py
@@ -391,7 +391,7 @@ def forward_lm(self, **kwargs):

     def log_params(self):
         """
-        Logs paramteres to generic logger MlLogger
+        Logs parameters to generic logger MlLogger
         """
         params = {
             "lm1_type": self.language_model1.__class__.__name__,
diff --git a/haystack/nodes/_json_schema.py b/haystack/nodes/_json_schema.py
index f99c1c3d46..1f7e0aee75 100644
--- a/haystack/nodes/_json_schema.py
+++ b/haystack/nodes/_json_schema.py
@@ -205,7 +205,7 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[
     param_fields.pop(0)
     param_fields_kwargs: Dict[str, Any] = {}

-    # Read all the paramteres extracted from the __init__ method with type and default value
+    # Read all the parameters extracted from the __init__ method with type and default value
     for param in param_fields:
         annotation = Any
         if param.annotation != param.empty:
diff --git a/haystack/nodes/audio/whisper_transcriber.py b/haystack/nodes/audio/whisper_transcriber.py
index 29de05041f..1e95669cd3 100644
--- a/haystack/nodes/audio/whisper_transcriber.py
+++ b/haystack/nodes/audio/whisper_transcriber.py
@@ -52,7 +52,7 @@ def __init__(
         :param api_key: OpenAI API key. If None, a local installation of Whisper is used.
         :param model_name_or_path: Name of the model to use. If using a local installation of Whisper, set this to
             one of the following values: "tiny", "small", "medium", "large", "large-v2". If using
-            the API, set thsi value to: "whisper-1" (default).
+            the API, set this value to: "whisper-1" (default).
         :param device: Device to use for inference. Only used if you're using a local installation of Whisper.
             If None, the device is automatically selected.
        :param api_base: The OpenAI API Base url, defaults to `https://api.openai.com/v1`.
diff --git a/haystack/nodes/base.py b/haystack/nodes/base.py
index 0dce2a5f22..7ba07174dc 100644
--- a/haystack/nodes/base.py
+++ b/haystack/nodes/base.py
@@ -42,7 +42,7 @@ def wrapper_exportable_to_yaml(self, *args, **kwargs):
             for k, v in params.items():
                 self._component_config["params"][k] = v

-            # Call the actuall __init__ function with all the arguments
+            # Call the actual __init__ function with all the arguments
             init_func(self, *args, **kwargs)

     return wrapper_exportable_to_yaml
diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py
index 3e66c1ab3b..73c303bfb3 100644
--- a/haystack/nodes/connector/crawler.py
+++ b/haystack/nodes/connector/crawler.py
@@ -95,7 +95,7 @@ def __init__(
             3) ["--remote-debugging-port=9222"]
             This option enables remote debug over HTTP.
             See [Chromium Command Line Switches](https://peter.sh/experiments/chromium-command-line-switches/) for more details on the available options.
-            If your crawler fails, rasing a `selenium.WebDriverException`, this [Stack Overflow thread](https://stackoverflow.com/questions/50642308/webdriverexception-unknown-error-devtoolsactiveport-file-doesnt-exist-while-t) can be helpful. Contains useful suggestions for webdriver_options.
+            If your crawler fails, raising a `selenium.WebDriverException`, this [Stack Overflow thread](https://stackoverflow.com/questions/50642308/webdriverexception-unknown-error-devtoolsactiveport-file-doesnt-exist-while-t) can be helpful. Contains useful suggestions for webdriver_options.
         """
         selenium_import.check()
         super().__init__()
diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py
index 607fc1374b..8dfadc3620 100644
--- a/haystack/nodes/image_to_text/transformers.py
+++ b/haystack/nodes/image_to_text/transformers.py
@@ -20,7 +20,7 @@
 from haystack.utils.torch_utils import ListDataset


-# supported models classes should be extended when HF image-to-text pipeline willl support more classes
+# supported models classes should be extended when HF image-to-text pipeline will support more classes
 # see https://github.com/huggingface/transformers/issues/21110
 SUPPORTED_MODELS_CLASSES = [
     "VisionEncoderDecoderModel",
diff --git a/haystack/nodes/prompt/invocation_layer/hugging_face.py b/haystack/nodes/prompt/invocation_layer/hugging_face.py
index ce46eb9f85..a7666dca15 100644
--- a/haystack/nodes/prompt/invocation_layer/hugging_face.py
+++ b/haystack/nodes/prompt/invocation_layer/hugging_face.py
@@ -193,7 +193,7 @@ def _prepare_pipeline_kwargs(self, **kwargs) -> Dict[str, Any]:
         # For models not yet supported by the transformers library, we must set `trust_remote_code=True` within
         # the underlying pipeline to ensure the model's successful loading. However, this does not guarantee the
         # tokenizer will be loaded alongside. Therefore, we need to add additional logic here to manually load the
-        # tokenizer and pass it to transformers' pipleine.
+        # tokenizer and pass it to transformers' pipeline.
         # Otherwise, calling `self.pipe.tokenizer.model_max_length` will return an error.
         tokenizer = self._prepare_tokenizer(model, hub_kwargs, model_kwargs)
diff --git a/haystack/nodes/prompt/invocation_layer/sagemaker_hf_text_gen.py b/haystack/nodes/prompt/invocation_layer/sagemaker_hf_text_gen.py
index aeabaf44a3..73b4f0c96d 100644
--- a/haystack/nodes/prompt/invocation_layer/sagemaker_hf_text_gen.py
+++ b/haystack/nodes/prompt/invocation_layer/sagemaker_hf_text_gen.py
@@ -214,7 +214,7 @@ def get_test_payload(cls) -> Dict[str, Any]:
         As of June 23, Sagemaker endpoints support the JSON payload format from the
         https://github.com/huggingface/text-generation-inference project. At the time of writing this docstring,
-        only Falcon models were deployed using this format. See pyton client implementation from the
+        only Falcon models were deployed using this format. See python client implementation from the
         https://github.com/huggingface/text-generation-inference for more details.

         :return: A payload used for testing if the current endpoint is working.
diff --git a/haystack/nodes/prompt/prompt_template.py b/haystack/nodes/prompt/prompt_template.py
index 7717885be2..1d56ff6bfd 100644
--- a/haystack/nodes/prompt/prompt_template.py
+++ b/haystack/nodes/prompt/prompt_template.py
@@ -66,7 +66,7 @@
 #
 # After some discussion we deemed the change to be too breaking for existing
 # use cases and which steps would have been necessary to migrate to the
-# new API in case someone was using an harcoded template we decided to
+# new API in case someone was using a hardcoded template we decided to
 # bring them back.
 #
 # So for the time being this must live here, no new template must be added
diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py
index 6b16745ef6..5302051928 100644
--- a/haystack/nodes/reader/farm.py
+++ b/haystack/nodes/reader/farm.py
@@ -130,7 +130,7 @@ def __init__(
         :param confidence_threshold: Filters out predictions below confidence_threshold. Value should be between 0 and 1. Disabled by default.
         :param proxies: Dict of proxy servers to use for downloading external models. Example: {'http': 'some.proxy:1234', 'http://hostname': 'my.proxy:3111'}
         :param local_files_only: Whether to force checking for local files only (and forbid downloads)
-        :param force_download: Whether fo force a (re-)download even if the model exists locally in the cache.
+        :param force_download: Whether to force a (re-)download even if the model exists locally in the cache.
         :param use_auth_token: The API token used to download private models from Huggingface.
                                If this parameter is set to `True`, then the token generated when running
                                `transformers-cli login` (stored in ~/.huggingface) will be used.
diff --git a/haystack/utils/deepsetcloud.py b/haystack/utils/deepsetcloud.py
index add4c86b35..b215a22f12 100644
--- a/haystack/utils/deepsetcloud.py
+++ b/haystack/utils/deepsetcloud.py
@@ -28,11 +28,11 @@ class PipelineStatus(Enum):
     UNDEPLOYMENT_SCHEDULED: str = "UNDEPLOYMENT_SCHEDULED"
     DEPLOYMENT_FAILED: str = "DEPLOYMENT_FAILED"
     UNDEPLOYMENT_FAILED: str = "UNDEPLOYMENT_FAILED"
-    UKNOWN: str = "UNKNOWN"
+    UNKNOWN: str = "UNKNOWN"

     @classmethod
     def from_str(cls, status_string: str) -> "PipelineStatus":
-        return cls.__dict__.get(status_string, PipelineStatus.UKNOWN)
+        return cls.__dict__.get(status_string, PipelineStatus.UNKNOWN)


 SATISFIED_STATES_KEY = "satisfied_states"
@@ -1117,7 +1117,7 @@ def create_eval_run(
         :param pipeline_config_name: The name of the pipeline to evaluate.
         :param evaluation_set: The name of the evaluation set to use.
         :param eval_mode: The evaluation mode to use.
-        :param debug: Wheter to enable debug output.
+        :param debug: Whether to enable debug output.
         :param comment: Comment to add about to the evaluation run.
         :param tags: Tags to add to the evaluation run.
         :param headers: Headers to pass to the API call
@@ -1219,7 +1219,7 @@ def update_eval_run(
         :param pipeline_config_name: The name of the pipeline to evaluate.
         :param evaluation_set: The name of the evaluation set to use.
         :param eval_mode: The evaluation mode to use.
-        :param debug: Wheter to enable debug output.
+        :param debug: Whether to enable debug output.
         :param comment: Comment to add about to the evaluation run.
         :param tags: Tags to add to the evaluation run.
         :param headers: Headers to pass to the API call
@@ -1560,7 +1560,7 @@ def create_run(
         :param pipeline_config_name: The name of the pipeline to evaluate. Use `list_pipelines()` to list all available pipelines.
         :param evaluation_set: The name of the evaluation set to use. Use `list_evaluation_sets()` to list all available evaluation sets.
         :param eval_mode: The evaluation mode to use.
-        :param debug: Wheter to enable debug output.
+        :param debug: Whether to enable debug output.
         :param comment: Comment to add about to the evaluation run.
         :param tags: Tags to add to the evaluation run.
         :param api_key: Secret value of the API key.
@@ -1603,7 +1603,7 @@ def update_run(
         :param pipeline_config_name: The name of the pipeline to evaluate. Use `list_pipelines()` to list all available pipelines.
         :param evaluation_set: The name of the evaluation set to use. Use `list_evaluation_sets()` to list all available evaluation sets.
         :param eval_mode: The evaluation mode to use.
-        :param debug: Wheter to enable debug output.
+        :param debug: Whether to enable debug output.
         :param comment: Comment to add about to the evaluation run.
         :param tags: Tags to add to the evaluation run.
         :param api_key: Secret value of the API key.
@@ -1691,7 +1691,7 @@ def start_run(
         """
         client = DeepsetCloud.get_eval_run_client(api_key=api_key, api_endpoint=api_endpoint, workspace=workspace)
         client.start_eval_run(eval_run_name=eval_run_name)
-        logger.info("You can check run progess by inspecting the `status` field returned from `get_run()`.")
+        logger.info("You can check run progress by inspecting the `status` field returned from `get_run()`.")

     @classmethod
     def create_and_start_run(
@@ -1716,7 +1716,7 @@ def create_and_start_run(
         :param pipeline_config_name: The name of the pipeline to evaluate. Use `list_pipelines()` to list all available pipelines.
         :param evaluation_set: The name of the evaluation set to use. Use `list_evaluation_sets()` to list all available evaluation sets.
         :param eval_mode: The evaluation mode to use.
-        :param debug: Wheter to enable debug output.
+        :param debug: Whether to enable debug output.
         :param comment: Comment to add about to the evaluation run.
         :param tags: Tags to add to the evaluation run.
         :param api_key: Secret value of the API key.
diff --git a/haystack/utils/squad_data.py b/haystack/utils/squad_data.py
index 5b466cf88a..5e950d4d35 100644
--- a/haystack/utils/squad_data.py
+++ b/haystack/utils/squad_data.py
@@ -227,7 +227,7 @@ def _aggregate_questions(x):
     def _aggregate_answers(x):
         x = x[["answer_text", "answer_start"]]
         x = x.rename(columns={"answer_text": "text"})
-        # Span anwser
+        # Span answer
         try:
             x["answer_start"] = x["answer_start"].astype(int)
             ret = x.to_dict("records")
diff --git a/proposals/text/3875-table-cell.md b/proposals/text/3875-table-cell.md
index b04d786d39..a014ded77e 100644
--- a/proposals/text/3875-table-cell.md
+++ b/proposals/text/3875-table-cell.md
@@ -193,7 +193,7 @@ for 2 additional versions of Haystack.

 ## What's the impact of not adding this feature?
 Requiring users to figure out how to interpret the linearized answer cell coordinates to reconstruct the row and column indices
-to be able to access the answer cell in the returned tabel.
+to be able to access the answer cell in the returned table.

 ## Other designs
 1. Expand `Span` dataclass to have optional `col` and `row` fields. This would require a similar check as `TableCell`, but instead
diff --git a/proposals/text/4284-drop-basecomponent.md b/proposals/text/4284-drop-basecomponent.md
index 72e81bd4eb..8ed961b6cb 100644
--- a/proposals/text/4284-drop-basecomponent.md
+++ b/proposals/text/4284-drop-basecomponent.md
@@ -47,7 +47,7 @@ On top of these issues, there is the tangential issue of `DocumentStore`s and th
 - `DocumentStore`s are nodes in theory, but in practice they can be added to `Pipeline`s only to receive documents to be stored. On the other hand, `DocumentStore`'s most prominent usecase is as a _source_ of documents, and currently they are not suited for this task without going through an intermediary, most often a `Retriever` class.
 - The relationship between `DocumentStore` and `Retriever` should be left as a topic for a separate proposal but kept in mind, because `Retriever`s currently act as the main interface for `DocumentStore`s into `Pipeline`s.

-This proposal tries to adress all the above point by taking a radical stance with:
+This proposal tries to address all the above points by taking a radical stance with:

 - A full reimplementation of the `Pipeline` class that does not limit itself to DAGs, can run branches in parallel, can skip branches and can process loops safely.

@@ -548,7 +548,7 @@ class MyNode:
     These values will be given to the `__init__` method of a new instance
     when the pipeline is deserialized.

-    The `__init__` must be extrememly lightweight, because it's a frequent
+    The `__init__` must be extremely lightweight, because it's a frequent
     operation during the construction and validation of the pipeline.
     If a node has some heavy state to initialize (models, backends, etc...)
     refer to the `warm_up()` method.
@@ -932,7 +932,7 @@ There are a number of drawbacks about the proposed approach:

 ## Known limitations

-- **Reusability of nodes across Pipelines in REST API.** Currently, REST API are designed in such a way that a separate worker is spawned for each pipeline deployed. That makes sharing node instances across them a non-starter. However, we believe this specific limitation can be adressed by a different approach to the problem, like splitting pipelines in a way that shared nodes are stored in a dedicated sub-pipeline and so on. We postpone addressing this problem when it arises, as we don't consider it blocking and workarounds can be found.
+- **Reusability of nodes across Pipelines in REST API.** Currently, REST API are designed in such a way that a separate worker is spawned for each pipeline deployed. That makes sharing node instances across them a non-starter. However, we believe this specific limitation can be addressed by a different approach to the problem, like splitting pipelines in a way that shared nodes are stored in a dedicated sub-pipeline and so on. We postpone addressing this problem when it arises, as we don't consider it blocking and workarounds can be found.

 # Adoption strategy
diff --git a/proposals/text/4370-documentstores-and-retrievers.md b/proposals/text/4370-documentstores-and-retrievers.md
index ebd01e9e1a..bf2b985bc2 100644
--- a/proposals/text/4370-documentstores-and-retrievers.md
+++ b/proposals/text/4370-documentstores-and-retrievers.md
@@ -18,7 +18,7 @@ Note: these stores are designed to work **only** alongside Haystack 2.0 Pipeline

 Current `DocumentStore` face several issues mostly due to their organic growth. Some of them are:

-- `DocumentStore`s perform the bulk of retrieval, but they need to be tighly coupled to a `Retriever` object to work. We believe this coupling can be broken by a clear API boundary between `DocumentStores`, `Retriever`s and `Embedder`s. In this PR we focus on decoupling them.
+- `DocumentStore`s perform the bulk of retrieval, but they need to be tightly coupled to a `Retriever` object to work. We believe this coupling can be broken by a clear API boundary between `DocumentStores`, `Retriever`s and `Embedder`s. In this PR we focus on decoupling them.

 - `DocumentStore`s tend to bring in complex dependencies, so less used stores should be easy to decouple into external packages at need.
diff --git a/proposals/text/5390-embedders.md b/proposals/text/5390-embedders.md
index 8d9d2277f7..5314570f34 100644
--- a/proposals/text/5390-embedders.md
+++ b/proposals/text/5390-embedders.md
@@ -65,7 +65,7 @@ results = query_pipe.run(...)

 # Motivation

 The motivations behind this change were already provided in the previous proposals ([Embedding Retriever](3558-embedding_retriever.md) and [DocumentStores and Retrievers](4370-document_stores_and_retrievers.md)). Here is a summary:
-- Retrievers should't be responsible for embedding Documents.
+- Retrievers shouldn't be responsible for embedding Documents.
 - Currently, Retrievers have many parameters just to support and configure different underlying Encoders(≈Embedders).
 - Adding support for new embedding providers or strategies is difficult. It requires changing the Retriever code.
diff --git a/pyproject.toml b/pyproject.toml
index 72763c320c..c8b2177eff 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -278,6 +278,11 @@ packages = [
 line-length = 120
 skip_magic_trailing_comma = true  # For compatibility with pydoc>=4.6, check if still needed.

+[tool.codespell]
+ignore-words-list = "ans,astroid,nd,ned,nin,ue"
+quiet-level = 3
+skip = "test/nodes/*,test/others/*,test/samples/*"
+
 [tool.pylint.'MESSAGES CONTROL']
 max-line-length=120
 load-plugins = "haystack_linter"
@@ -330,7 +335,7 @@ disable = [
     "deprecated-method",
 ]
 [tool.pylint.'DESIGN']
-max-args = 37  # Default is 5
+max-args = 38  # Default is 5
 max-attributes = 27  # Default is 7
 max-branches = 34  # Default is 12
 max-locals = 45  # Default is 15
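The same `[tool.codespell]` table also configures direct runs of the tool. A minimal sketch, assuming `codespell` and `tomli` are installed so the settings in `pyproject.toml` are picked up (the commands are illustrative, not part of the patch):

```shell
# codespell reads [tool.codespell] from pyproject.toml when tomli is available
codespell

# Equivalent explicit invocation mirroring the configuration above
codespell --ignore-words-list "ans,astroid,nd,ned,nin,ue" --quiet-level 3 --skip "test/nodes/*,test/others/*,test/samples/*"
```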
diff --git a/releasenotes/notes/codespell-d4a32b9c589ca26e.yaml b/releasenotes/notes/codespell-d4a32b9c589ca26e.yaml
new file mode 100644
index 0000000000..9fb4efa0dd
--- /dev/null
+++ b/releasenotes/notes/codespell-d4a32b9c589ca26e.yaml
@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    ci: Fix typos discovered by codespell running in pre-commit.
diff --git a/rest_api/rest_api/controller/feedback.py b/rest_api/rest_api/controller/feedback.py
index ff02ea701b..59dc357bf4 100644
--- a/rest_api/rest_api/controller/feedback.py
+++ b/rest_api/rest_api/controller/feedback.py
@@ -46,7 +46,7 @@ def get_feedback(index: Optional[str] = None):
 @router.delete("/feedback")
 def delete_feedback(index: Optional[str] = None):
     """
-    This endpoint allows the API user to delete all the feedback that has been sumbitted through the
+    This endpoint allows the API user to delete all the feedback that has been submitted through the
     `POST /feedback` endpoint.
     """
     all_labels = document_store.get_all_labels(index=index)
diff --git a/test/conftest.py b/test/conftest.py
index aec71800ea..77823ca2eb 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -93,7 +93,7 @@ def fail_at_version(target_major, target_minor):
     of target_major and/or target_minor.
     If the current version has `rc0` set the test won't fail but only issue a warning, this is
     done because we use `rc0` to mark the development version in `main`. If we wouldn't
-    do this tests would continuosly fail in main.
+    do this tests would continuously fail in main.

     ```python
     from ..conftest import fail_at_version
diff --git a/test/document_stores/test_opensearch.py b/test/document_stores/test_opensearch.py
index fb55c24173..ca4d4fc9b5 100644
--- a/test/document_stores/test_opensearch.py
+++ b/test/document_stores/test_opensearch.py
@@ -1248,9 +1248,9 @@ def test_bulk_write_retries_for_always_failing_insert_is_canceled(self, mocked_d
         with pytest.raises(DocumentStoreError, match="Last try of bulk indexing documents failed."):
             mocked_document_store._bulk(documents=docs_to_write, _timeout=0, _remaining_tries=3)

-        assert mocked_bulk.call_count == 3  # depth first search failes and cancels the whole bulk request
+        assert mocked_bulk.call_count == 3  # depth first search fails and cancels the whole bulk request

-        assert "Too Many Requeset" in caplog.text
+        assert "Too Many Requests" in caplog.text
         assert " Splitting the number of documents into two chunks with the same size" in caplog.text

     @pytest.mark.unit
diff --git a/test/prompt/test_prompt_node.py b/test/prompt/test_prompt_node.py
index c305b799b2..376ab9ca42 100644
--- a/test/prompt/test_prompt_node.py
+++ b/test/prompt/test_prompt_node.py
@@ -484,7 +484,7 @@ def test_prompt_node_no_debug(prompt_model):
     pipe = Pipeline()
     pipe.add_node(component=node, name="prompt_node", inputs=["Query"])

-    # debug explicitely False
+    # debug explicitly False
     result = pipe.run(query="not relevant", documents=[Document("Berlin is the capital of Germany")], debug=False)
     assert result.get("_debug", "No debug info") == "No debug info"
diff --git a/test/samples/markdown/sample.md b/test/samples/markdown/sample.md
index 97ad812c85..d39e32d44e 100644
--- a/test/samples/markdown/sample.md
+++ b/test/samples/markdown/sample.md
@@ -20,7 +20,7 @@ pip install farm-haystack
 ## Core Features

 - **Latest models**: Utilize all latest transformer based models (e.g. BERT, RoBERTa, MiniLM) for extractive QA, generative QA and document retrieval.
-- **Modular**: Multiple choices to fit your tech stack and use case. Pick your favorite database, file converter or modeling framwework.
+- **Modular**: Multiple choices to fit your tech stack and use case. Pick your favorite database, file converter or modeling framework.
 - **Open**: 100% compatible with HuggingFace's model hub. Tight interfaces to other frameworks (e.g. Transformers, FARM, sentence-transformers)
 - **Scalable**: Scale to millions of docs via retrievers, production-ready backends like Elasticsearch / FAISS and a fastAPI REST API
 - **End-to-End**: All tooling in one place: file conversion, cleaning, splitting, training, eval, inference, labeling ...