
Commit

ci: Fix typos discovered by codespell (#5778)
* Fix typos discovered by codespell

* pylint: max-args = 38
cclauss authored Sep 13, 2023
1 parent 30ca042 commit 6dd52d9
Showing 44 changed files with 82 additions and 66 deletions.
7 changes: 7 additions & 0 deletions .pre-commit-config.yaml
@@ -26,6 +26,13 @@ repos:
hooks:
- id: black-jupyter

- repo: https://github.com/codespell-project/codespell
rev: v2.2.5
hooks:
- id: codespell
additional_dependencies:
- tomli

- repo: https://github.com/rhysd/actionlint
rev: v1.6.25
hooks:
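The codespell hook added above lists tomli as an additional dependency because codespell can read its settings from pyproject.toml, and on Python versions older than 3.11 (no standard-library tomllib) it needs the tomli backport to parse that file. A minimal sketch of that kind of lookup, assuming this repository keeps [tool.codespell] and [tool.pylint] tables in pyproject.toml (the commit message's "max-args = 38" bump would live in the latter):

    # Sketch only: assumes [tool.codespell] / [tool.pylint] tables exist in pyproject.toml.
    try:
        import tomllib  # Python 3.11+
    except ModuleNotFoundError:
        import tomli as tomllib  # backport, the hook's additional dependency

    with open("pyproject.toml", "rb") as f:
        config = tomllib.load(f)

    print(config.get("tool", {}).get("codespell", {}))  # e.g. ignore-words-list, skip
    print(config.get("tool", {}).get("pylint", {}))     # e.g. the max-args setting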
2 changes: 1 addition & 1 deletion docker/Dockerfile.base
@@ -43,5 +43,5 @@ ENV PATH="/opt/venv/bin:$PATH"
RUN python3 -c "from haystack.utils.docker import cache_schema; cache_schema()"

# Haystack Preprocessor uses NLTK punkt model to divide text into a list of sentences.
# We cache these models for seemless user experience.
# We cache these models for seamless user experience.
RUN python3 -c "from haystack.utils.docker import cache_nltk_model; cache_nltk_model()"
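The cache_nltk_model helper is not shown in this diff; assuming it simply pre-downloads the NLTK punkt sentence tokenizer at image build time so the first PreProcessor call does not hit the network, a rough equivalent would be:

    # Assumed behaviour of cache_nltk_model; the real helper may do more.
    import nltk

    nltk.download("punkt")  # stores the model under ~/nltk_data by default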
2 changes: 1 addition & 1 deletion docs/pydoc/config/base-document-classifier.yml
@@ -13,7 +13,7 @@ processors:
- type: crossref
renderer:
type: renderers.ReadmeRenderer
excerpt: Abstract class for the Document Classifer.
excerpt: Abstract class for the Document Classifier.
category_slug: haystack-classes
title: Base Document Classifier API
slug: base-document-classifier-api
2 changes: 1 addition & 1 deletion e2e/document_stores/test_update_embeddings.py
@@ -141,7 +141,7 @@ def test_update_embeddings_table_text_retriever(tmp_path):
assert doc.meta["meta_field"] == "value_table_0"
np.testing.assert_array_almost_equal(documents[0].embedding, documents[1].embedding, decimal=4)

# Check if Documents wih different content (text) get different embedding
# Check if Documents with different content (text) get different embedding
documents = ds.get_all_documents(
filters={"meta_field": ["value_text_1", "value_text_2"]}, return_embedding=True
)
2 changes: 1 addition & 1 deletion haystack/agents/conversational.py
@@ -107,6 +107,6 @@ def __init__(
def add_tool(self, tool: Tool):
if len(self.tm.tools) == 0:
raise AgentError(
"You cannot add tools after initializing the ConversationalAgent without any tools. If you want to add tools, reinitailize the ConversationalAgent and provide `tools`."
"You cannot add tools after initializing the ConversationalAgent without any tools. If you want to add tools, reinitialize the ConversationalAgent and provide `tools`."
)
return super().add_tool(tool)
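The error message above implies the intended usage: pass tools when constructing the agent rather than adding them to a tool-less agent afterwards. A hedged sketch, where the exact PromptNode, Tool, and ConversationalAgent arguments are assumptions rather than something taken from this diff:

    # Sketch only: constructor arguments are assumed, not shown in this diff.
    from haystack.agents import Tool
    from haystack.agents.conversational import ConversationalAgent
    from haystack.nodes import PromptNode

    prompt_node = PromptNode("gpt-3.5-turbo", api_key="...", max_length=256)
    search_tool = Tool(
        name="Search",
        pipeline_or_node=prompt_node,  # in practice, usually a search pipeline
        description="Useful for answering questions about current events",
    )

    # Provide tools up front; add_tool() on an agent created without tools raises AgentError.
    agent = ConversationalAgent(prompt_node, tools=[search_tool])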
4 changes: 2 additions & 2 deletions haystack/document_stores/elasticsearch/es7.py
@@ -70,8 +70,8 @@ def __init__(
:param port: port(s) of elasticsearch nodes
:param username: username (standard authentication via http_auth)
:param password: password (standard authentication via http_auth)
:param api_key_id: ID of the API key (altenative authentication mode to the above http_auth)
:param api_key: Secret value of the API key (altenative authentication mode to the above http_auth)
:param api_key_id: ID of the API key (alternative authentication mode to the above http_auth)
:param api_key: Secret value of the API key (alternative authentication mode to the above http_auth)
:param aws4auth: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package)
:param index: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one.
:param label_index: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one.
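The docstring lists two alternative ways of authenticating. A short sketch using the parameter names shown above, assuming the usual ElasticsearchDocumentStore import path:

    from haystack.document_stores import ElasticsearchDocumentStore

    # Standard authentication via http_auth
    ds = ElasticsearchDocumentStore(host="localhost", port=9200, username="elastic", password="...")

    # Alternative: API-key authentication instead of username/password
    ds = ElasticsearchDocumentStore(host="localhost", port=9200, api_key_id="...", api_key="...")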
4 changes: 2 additions & 2 deletions haystack/document_stores/elasticsearch/es8.py
@@ -79,8 +79,8 @@ def __init__(
:param port: port(s) of elasticsearch nodes
:param username: username (standard authentication via http_auth)
:param password: password (standard authentication via http_auth)
:param api_key_id: ID of the API key (altenative authentication mode to the above http_auth)
:param api_key: Secret value of the API key (altenative authentication mode to the above http_auth)
:param api_key_id: ID of the API key (alternative authentication mode to the above http_auth)
:param api_key: Secret value of the API key (alternative authentication mode to the above http_auth)
:param aws4auth: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package)
:param index: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one.
:param label_index: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one.
8 changes: 4 additions & 4 deletions haystack/document_stores/es_converter.py
@@ -72,8 +72,8 @@ def open_search_index_to_document_store(
:param port: Ports(s) of OpenSearch nodes.
:param username: Username (standard authentication via http_auth).
:param password: Password (standard authentication via http_auth).
:param api_key_id: ID of the API key (altenative authentication mode to the above http_auth).
:param api_key: Secret value of the API key (altenative authentication mode to the above http_auth).
:param api_key_id: ID of the API key (alternative authentication mode to the above http_auth).
:param api_key: Secret value of the API key (alternative authentication mode to the above http_auth).
:param aws4auth: Authentication for usage with AWS OpenSearch
(can be generated with the requests-aws4auth package).
:param scheme: `"https"` or `"http"`, protocol used to connect to your OpenSearch instance.
@@ -171,8 +171,8 @@ def elasticsearch_index_to_document_store(
:param port: Ports(s) of Elasticsearch nodes.
:param username: Username (standard authentication via http_auth).
:param password: Password (standard authentication via http_auth).
:param api_key_id: ID of the API key (altenative authentication mode to the above http_auth).
:param api_key: Secret value of the API key (altenative authentication mode to the above http_auth).
:param api_key_id: ID of the API key (alternative authentication mode to the above http_auth).
:param api_key: Secret value of the API key (alternative authentication mode to the above http_auth).
:param aws4auth: Authentication for usage with AWS Elasticsearch
(can be generated with the requests-aws4auth package).
:param scheme: `"https"` or `"http"`, protocol used to connect to your Elasticsearch instance.
2 changes: 1 addition & 1 deletion haystack/document_stores/memory.py
@@ -73,7 +73,7 @@ def __init__(
exists.
:param use_gpu: Whether to use a GPU or the CPU for calculating embedding similarity.
Falls back to CPU if no GPU is available.
:param scoring_batch_size: Batch size of documents to calculate similarity for. Very small batch sizes are inefficent.
:param scoring_batch_size: Batch size of documents to calculate similarity for. Very small batch sizes are inefficient.
Very large batch sizes can overrun GPU memory. In general you want to make sure
you have at least `embedding_dim`*`scoring_batch_size`*4 bytes available in GPU memory.
Since the data is originally stored in CPU memory there is little risk of overruning memory
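The docstring above gives the rule of thumb embedding_dim * scoring_batch_size * 4 bytes of GPU memory. A quick worked example with illustrative numbers (not defaults taken from this diff):

    # float32 embeddings: embedding_dim * scoring_batch_size * 4 bytes of GPU memory
    embedding_dim = 768
    scoring_batch_size = 500_000
    required_bytes = embedding_dim * scoring_batch_size * 4
    print(f"{required_bytes / 1024**3:.2f} GiB")  # ~1.43 GiB for this batch size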
8 changes: 4 additions & 4 deletions haystack/document_stores/opensearch.py
@@ -82,8 +82,8 @@ def __init__(
:param port: port(s) of OpenSearch nodes
:param username: username (standard authentication via http_auth)
:param password: password (standard authentication via http_auth)
:param api_key_id: ID of the API key (altenative authentication mode to the above http_auth)
:param api_key: Secret value of the API key (altenative authentication mode to the above http_auth)
:param api_key_id: ID of the API key (alternative authentication mode to the above http_auth)
:param api_key: Secret value of the API key (alternative authentication mode to the above http_auth)
:param aws4auth: Authentication for usage with AWS OpenSearch Service (can be generated with the requests-aws4auth package)
:param index: Name of index in OpenSearch to use for storing the documents that we want to search. If not existing yet, we will create one.
:param label_index: Name of index in OpenSearch to use for storing labels. If not existing yet, we will create one.
@@ -1299,7 +1299,7 @@ def _get_raw_similarity_score(self, score):
# adjust scores according to https://opensearch.org/docs/latest/search-plugins/knn/approximate-knn
# and https://opensearch.org/docs/latest/search-plugins/knn/knn-score-script/

# space type is required as criterion as there is no consistent similarity-to-space-type mapping accross knn engines
# space type is required as criterion as there is no consistent similarity-to-space-type mapping across knn engines
if self.space_type == "innerproduct":
if score > 1:
score = score - 1
@@ -1419,7 +1419,7 @@ def _train_ivf_index(

def _recommended_ivf_train_size(self) -> int:
"""
Calculates the minumum recommended number of training samples for IVF training as suggested in FAISS docs.
Calculates the minimum recommended number of training samples for IVF training as suggested in FAISS docs.
https://github.com/facebookresearch/faiss/wiki/FAQ#can-i-ignore-warning-clustering-xxx-points-to-yyy-centroids
"""
min_points_per_cluster = 39
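The FAISS FAQ linked above warns when fewer than roughly 39 training points per centroid are provided, so the recommended training-set size presumably scales with the number of IVF clusters. A hedged sketch of that calculation (the actual method body is cut off in this diff):

    # Assumed reading of the FAISS guideline: at least 39 training points per centroid.
    def recommended_ivf_train_size(n_clusters: int, min_points_per_cluster: int = 39) -> int:
        return n_clusters * min_points_per_cluster

    print(recommended_ivf_train_size(256))  # 9984 training samples for 256 clusters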
2 changes: 1 addition & 1 deletion haystack/document_stores/search_engine.py
@@ -206,7 +206,7 @@ def _bulk(
except Exception as e:
if hasattr(e, "status_code") and e.status_code == 429: # type: ignore
logger.warning(
"Failed to insert a batch of '%s' documents because of a 'Too Many Requeset' response. "
"Failed to insert a batch of '%s' documents because of a 'Too Many Requests' response. "
"Splitting the number of documents into two chunks with the same size and retrying in %s seconds.",
len(documents),
_timeout,
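The warning above describes a halve-and-retry strategy for 429 (Too Many Requests) responses. A simplified, standalone sketch of that pattern, not the actual _bulk implementation:

    import time

    def bulk_with_backoff(documents, insert_batch, timeout=1.0):
        """Try to insert a batch; on HTTP 429, wait, split it in half and retry each half."""
        try:
            insert_batch(documents)
        except Exception as e:
            if getattr(e, "status_code", None) == 429 and len(documents) > 1:
                time.sleep(timeout)
                half = len(documents) // 2
                bulk_with_backoff(documents[:half], insert_batch, timeout * 2)
                bulk_with_backoff(documents[half:], insert_batch, timeout * 2)
            else:
                raise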
2 changes: 1 addition & 1 deletion haystack/modeling/data_handler/data_silo.py
@@ -110,7 +110,7 @@ def _get_dataset(self, filename: Optional[Union[str, Path]], dicts: Optional[Lis
# loading dicts from file (default)
if dicts is None:
dicts = list(self.processor.file_to_dicts(filename)) # type: ignore
# shuffle list of dicts here if we later want to have a random dev set splitted from train set
# shuffle list of dicts here if we later want to have a random dev set split from train set
if str(self.processor.train_filename) in str(filename):
if not self.processor.dev_filename:
if self.processor.dev_split > 0.0:
12 changes: 6 additions & 6 deletions haystack/modeling/data_handler/processor.py
@@ -387,7 +387,7 @@ def __init__(
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:param data_dir: The directory in which the train and dev files can be found.
If not available the dataset will be loaded automaticaly
If not available the dataset will be loaded automatically
if the last directory has the same name as a predefined dataset.
These predefined datasets are defined as the keys in the dict at
`haystack.basics.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/haystack/blob/main/haystack/basics/data_handler/utils.py>`_.
@@ -575,7 +575,7 @@ def _split_docs_into_passages(self, baskets: List[SampleBasket]):
)
except Exception as e:
logger.warning(
"Could not devide document into passages. Document: %s\nWith error: %s",
"Could not divide document into passages. Document: %s\nWith error: %s",
basket.raw["document_text"][:200],
e,
)
@@ -880,7 +880,7 @@ def __init__(
:param max_seq_len_query: Query samples are truncated after this many tokens.
:param max_seq_len_passage: Context/Passage Samples are truncated after this many tokens.
:param data_dir: The directory in which the train and dev files can be found.
If not available the dataset will be loaded automaticaly
If not available the dataset will be loaded automatically
if the last directory has the same name as a predefined dataset.
These predefined datasets are defined as the keys in the dict at
`haystack.basics.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/haystack/blob/main/haystack/basics/data_handler/utils.py>`_.
@@ -1038,7 +1038,7 @@ def dataset_from_dicts(
# Take the dict and insert into our basket structure, this stages also adds an internal IDs
baskets = self._fill_baskets(dicts, indices)

# Separat conversion of query
# Separate conversion of query
baskets = self._convert_queries(baskets=baskets)

# and context passages. When converting the context the label is also assigned.
@@ -1799,7 +1799,7 @@ def __init__(
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
:param data_dir: The directory in which the train and dev files can be found.
If not available the dataset will be loaded automaticaly
If not available the dataset will be loaded automatically
if the last directory has the same name as a predefined dataset.
These predefined datasets are defined as the keys in the dict at
`farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/main/farm/data_handler/utils.py>`_.
@@ -2151,7 +2151,7 @@ def write_squad_predictions(predictions, out_filename, predictions_filename=None
dev_labels[q["id"]] = q["answers"][0]["text"]
not_included = set(list(dev_labels.keys())) - set(list(predictions_json.keys()))
if len(not_included) > 0:
logger.info("There were missing predicitons for question ids: %s", list(not_included))
logger.info("There were missing predictions for question ids: %s", list(not_included))
for x in not_included:
predictions_json[x] = ""

2 changes: 1 addition & 1 deletion haystack/modeling/data_handler/samples.py
@@ -187,7 +187,7 @@ def offset_to_token_idx_vecorized(token_offsets, ch_idx):
if ch_idx >= np.max(token_offsets):
# idx must be including
idx = np.argmax(token_offsets)
# looking for the first occurence of token_offsets larger than ch_idx and taking one position to the left.
# looking for the first occurrence of token_offsets larger than ch_idx and taking one position to the left.
# This is needed to overcome n special_tokens at start of sequence
# and failsafe matching (the character start might not always coincide with a token offset, e.g. when starting at whitespace)
else:
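The comment above describes the branch that is cut off here: find the first token offset strictly greater than ch_idx, then step one position back. In NumPy that is typically a single argmax over a boolean mask; a small illustration, not the full function:

    import numpy as np

    token_offsets = np.array([0, 5, 11, 18])  # start character of each token
    ch_idx = 13                               # a character inside the third token

    # First offset strictly greater than ch_idx, then one position to the left.
    idx = int(np.argmax(token_offsets > ch_idx)) - 1
    print(idx)  # 2 -> the token starting at character 11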
2 changes: 1 addition & 1 deletion haystack/modeling/infer.py
@@ -304,7 +304,7 @@ def inference_from_dicts(
:return: list of predictions
"""
# whether to aggregate predictions across different samples (e.g. for QA on long texts)
# TODO remove or adjust after implmenting input objects properly
# TODO remove or adjust after implementing input objects properly
# if set(dicts[0].keys()) == {"qas", "context"}:
# warnings.warn("QA Input dictionaries with [qas, context] as keys will be deprecated in the future",
# DeprecationWarning)
2 changes: 1 addition & 1 deletion haystack/modeling/model/adaptive_model.py
@@ -568,7 +568,7 @@ def forward_lm(self, **kwargs):

def log_params(self):
"""
Logs parameteres to generic logger MlLogger
Logs parameters to generic logger MlLogger
"""
params = {
"lm_type": self.language_model.__class__.__name__,
2 changes: 1 addition & 1 deletion haystack/modeling/model/biadaptive_model.py
@@ -367,7 +367,7 @@ def forward_lm(

def log_params(self):
"""
Logs paramteres to generic logger MlLogger
Logs parameters to generic logger MlLogger
"""
params = {
"lm1_type": self.language_model1.__class__.__name__,
2 changes: 1 addition & 1 deletion haystack/modeling/model/feature_extraction.py
@@ -382,7 +382,7 @@ def _words_to_tokens(
first_token = True
for token in tokens_word:
token_offsets.append(word_offset)
# Depending on the tokenizer type special chars are added to distinguish tokens with preceeding
# Depending on the tokenizer type special chars are added to distinguish tokens with preceding
# whitespace (=> "start of a word"). We need to get rid of these to calculate the original length of the token
original_token = re.sub(SPECIAL_TOKENIZER_CHARS, "", token)
# Don't use length of unk token for offset calculation
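For context on the comment above: subword tokenizers often mark a token that starts a new word with a special prefix character, for example "Ġ" in GPT-2-style BPE or "▁" in SentencePiece. Stripping those markers before measuring the token recovers its original surface length. A small illustration; the actual SPECIAL_TOKENIZER_CHARS pattern in this file is not shown in the diff and may differ:

    import re

    # Illustrative pattern only; the real SPECIAL_TOKENIZER_CHARS may differ.
    MARKERS = re.compile(r"^[\u0120\u2581]")  # 'Ġ' (BPE) or '▁' (SentencePiece)

    for token in ["Ġworld", "▁hello"]:
        original = MARKERS.sub("", token)
        print(token, "->", original, len(original))  # both recover length 5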
2 changes: 1 addition & 1 deletion haystack/modeling/model/language_model.py
@@ -153,7 +153,7 @@ def formatted_preds(
"""
Extracting vectors from a language model (for example, for extracting sentence embeddings).
You can use different pooling strategies and layers by specifying them in the object attributes
`extraction_layer` and `extraction_strategy`. You should set both these attirbutes using the Inferencer:
`extraction_layer` and `extraction_strategy`. You should set both these attributes using the Inferencer:
Example: Inferencer(extraction_strategy='cls_token', extraction_layer=-1)
:param logits: Tuple of (sequence_output, pooled_output) from the language model.
2 changes: 1 addition & 1 deletion haystack/modeling/model/optimization.py
@@ -142,7 +142,7 @@ def initialize_optimizer(
# Adjust for parallel training
model, optimizer = optimize_model(model, device, local_rank, optimizer, distributed)

# Get learning rate schedule - moved below to supress warning
# Get learning rate schedule - moved below to suppress warning
scheduler = get_scheduler(optimizer, schedule_opts)

return model, optimizer, scheduler
4 changes: 2 additions & 2 deletions haystack/modeling/model/prediction_head.py
@@ -864,7 +864,7 @@ def pred_to_doc_idxs(pred, passage_start_t):
"""
Converts the passage level predictions to document level predictions. Note that on the doc level we
don't have special tokens or question tokens. This means that a no answer
cannot be prepresented by a (0,0) qa_answer but will instead be represented by (-1, -1)
cannot be represented by a (0,0) qa_answer but will instead be represented by (-1, -1)
"""
new_pred = []
for qa_answer in pred:
@@ -891,7 +891,7 @@ def label_to_doc_idxs(label, passage_start_t):
"""
Converts the passage level labels to document level labels. Note that on the doc level we
don't have special tokens or question tokens. This means that a no answer
cannot be prepresented by a (0,0) span but will instead be represented by (-1, -1)
cannot be represented by a (0,0) span but will instead be represented by (-1, -1)
"""
new_label = []
for start, end in label:
2 changes: 1 addition & 1 deletion haystack/modeling/model/triadaptive_model.py
@@ -391,7 +391,7 @@ def forward_lm(self, **kwargs):

def log_params(self):
"""
Logs paramteres to generic logger MlLogger
Logs parameters to generic logger MlLogger
"""
params = {
"lm1_type": self.language_model1.__class__.__name__,
2 changes: 1 addition & 1 deletion haystack/nodes/_json_schema.py
@@ -205,7 +205,7 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[
param_fields.pop(0)
param_fields_kwargs: Dict[str, Any] = {}

# Read all the paramteres extracted from the __init__ method with type and default value
# Read all the parameters extracted from the __init__ method with type and default value
for param in param_fields:
annotation = Any
if param.annotation != param.empty:
2 changes: 1 addition & 1 deletion haystack/nodes/audio/whisper_transcriber.py
@@ -52,7 +52,7 @@ def __init__(
:param api_key: OpenAI API key. If None, a local installation of Whisper is used.
:param model_name_or_path: Name of the model to use. If using a local installation of Whisper, set this to one of the following values: "tiny", "small", "medium", "large", "large-v2". If using
the API, set thsi value to: "whisper-1" (default).
the API, set this value to: "whisper-1" (default).
:param device: Device to use for inference. Only used if you're using a local
installation of Whisper. If None, the device is automatically selected.
:param api_base: The OpenAI API Base url, defaults to `https://api.openai.com/v1`.
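A short usage sketch based only on the parameters listed above; the import path and anything beyond those parameters are assumptions, and local use additionally requires a Whisper installation:

    # Sketch only: assumes WhisperTranscriber is exported from haystack.nodes.
    from haystack.nodes import WhisperTranscriber

    # Local inference: no api_key, pick one of the local model sizes.
    local_whisper = WhisperTranscriber(api_key=None, model_name_or_path="small")

    # Hosted API: provide an OpenAI key and use the API model name.
    api_whisper = WhisperTranscriber(api_key="sk-...", model_name_or_path="whisper-1")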