Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: Python performance improvements with ruff C4 and PERF fixes #5803

Merged
merged 6 commits into from
Sep 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ repos:
- id: black-jupyter

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.289
rev: v0.0.290
hooks:
- id: ruff
- id: ruff

- repo: https://github.com/codespell-project/codespell
rev: v2.2.5
Expand Down
10 changes: 5 additions & 5 deletions e2e/modeling/test_dpr.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,7 +777,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa

# generate embeddings with model loaded from model hub
dataset, tensor_names, _, __ = processor.dataset_from_dicts(
dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
dicts=[d], indices=list(range(len([d]))), return_baskets=True
)

data_loader = NamedDataLoader(
Expand Down Expand Up @@ -811,7 +811,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa

# generate embeddings with model loaded from disk
dataset2, tensor_names2, _, __ = loaded_processor.dataset_from_dicts(
dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
dicts=[d], indices=list(range(len([d]))), return_baskets=True
)

data_loader = NamedDataLoader(
Expand All @@ -820,7 +820,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa
all_embeddings2: Dict[str, Any] = {"query": [], "passages": []}
loaded_model.eval()

for i, batch in enumerate(tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True)):
for batch in tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True):
batch = {key: batch[key].to(device) for key in batch}

# get logits
Expand Down Expand Up @@ -904,7 +904,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa

# generate embeddings with model loaded from disk that originated from a FARM style model that was saved to disk earlier
dataset3, tensor_names3, _, __ = loaded_processor.dataset_from_dicts(
dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
dicts=[d], indices=list(range(len([d]))), return_baskets=True
)

data_loader = NamedDataLoader(
Expand All @@ -913,7 +913,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa
all_embeddings3: Dict[str, Any] = {"query": [], "passages": []}
loaded_model.eval()

for i, batch in enumerate(tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True)):
for batch in tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True):
batch = {key: batch[key].to(device) for key in batch}

# get logits
Expand Down
2 changes: 1 addition & 1 deletion haystack/document_stores/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ def _drop_duplicate_documents(self, documents: List[Document], index: Optional[s
:param index: name of the index
:return: A list of Haystack Document objects.
"""
_hash_ids: Set = set([])
_hash_ids: Set = set()
_documents: List[Document] = []

for document in documents:
Expand Down
6 changes: 3 additions & 3 deletions haystack/document_stores/opensearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -1215,11 +1215,11 @@ def _get_embedding_field_mapping(
def _ivf_model_exists(self, index: str) -> bool:
if self._index_exists(".opensearch-knn-models"):
response = self.client.transport.perform_request("GET", "/_plugins/_knn/models/_search")
existing_ivf_models = set(
existing_ivf_models = {
model["_source"]["model_id"]
for model in response["hits"]["hits"]
if model["_source"]["state"] != "failed"
)
}
else:
existing_ivf_models = set()

Expand Down Expand Up @@ -1461,7 +1461,7 @@ def _delete_ivf_model(self, index: str):
"""
if self._index_exists(".opensearch-knn-models"):
response = self.client.transport.perform_request("GET", "/_plugins/_knn/models/_search")
existing_ivf_models = set(model["_source"]["model_id"] for model in response["hits"]["hits"])
existing_ivf_models = {model["_source"]["model_id"] for model in response["hits"]["hits"]}
if f"{index}-ivf" in existing_ivf_models:
self.client.transport.perform_request("DELETE", f"/_plugins/_knn/models/{index}-ivf")

Expand Down
2 changes: 1 addition & 1 deletion haystack/modeling/data_handler/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def flatten_rename(
assert any(key in encoded_batch for key in keys), f"one of the keys {keys} is not in batch {encoded_batch.keys()}"
features_flat = []
for item in range(len(encoded_batch[keys[0]])):
feat_dict = {k: v for k, v in zip(renamed_keys, [encoded_batch[k][item] for k in keys])}
feat_dict = dict(zip(renamed_keys, [encoded_batch[k][item] for k in keys]))
features_flat.append(feat_dict)
return features_flat

Expand Down
14 changes: 7 additions & 7 deletions haystack/modeling/data_handler/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def load_from_dir(cls, load_dir: str):
config = json.load(f)
config["inference"] = True
# init tokenizer
if "lower_case" in config.keys():
if "lower_case" in config:
logger.warning(
"Loading tokenizer from deprecated config. "
"If you used `custom_vocab` or `never_split_chars`, this won't work anymore."
Expand Down Expand Up @@ -1249,7 +1249,7 @@ def _combine_title_context(titles: List[str], texts: List[str]):
"Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. Related passage text: '%s' ",
ctx,
)
res.append(tuple((title, ctx)))
res.append((title, ctx))
return res


Expand Down Expand Up @@ -1762,7 +1762,7 @@ def _combine_meta_context(meta_fields: List[str], texts: List[str]):
for meta, ctx in zip(meta_fields, texts):
if meta is None:
meta = ""
res.append(tuple((meta, ctx)))
res.append((meta, ctx))
return res


Expand Down Expand Up @@ -2111,12 +2111,12 @@ def dataset_from_dicts(
truncation=True,
max_length=self.max_seq_len,
)
names = [key for key in tokens]
names = list(tokens)
inputs = [tokens[key] for key in tokens]
if not "padding_mask" in names:
if "padding_mask" not in names:
index = names.index("attention_mask")
names[index] = "padding_mask"
if not "segment_ids" in names:
if "segment_ids" not in names:
index = names.index("token_type_ids")
names[index] = "segment_ids"

Expand Down Expand Up @@ -2149,7 +2149,7 @@ def write_squad_predictions(predictions, out_filename, predictions_filename=None
dev_labels[q["id"]] = "is_impossible"
else:
dev_labels[q["id"]] = q["answers"][0]["text"]
not_included = set(list(dev_labels.keys())) - set(list(predictions_json.keys()))
not_included = dev_labels.keys() - predictions_json.keys()
if len(not_included) > 0:
logger.info("There were missing predictions for question ids: %s", list(not_included))
for x in not_included:
Expand Down
2 changes: 1 addition & 1 deletion haystack/modeling/model/biadaptive_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ def forward(
output2 = None

embedding1, embedding2 = head(output1, output2)
all_logits.append(tuple([embedding1, embedding2]))
all_logits.append((embedding1, embedding2))
else:
# just return LM output (e.g. useful for extracting embeddings at inference time)
all_logits.append((pooled_output))
Expand Down
8 changes: 4 additions & 4 deletions haystack/modeling/model/language_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,14 +108,14 @@ def output_dims(self):
if self._output_dims:
return self._output_dims

for odn in OUTPUT_DIM_NAMES:
try:
try:
for odn in OUTPUT_DIM_NAMES:
value = getattr(self.model.config, odn, None)
if value:
self._output_dims = value
return value
except AttributeError:
raise ModelingError("Can't get the output dimension before loading the model.")
except AttributeError:
raise ModelingError("Can't get the output dimension before loading the model.")

raise ModelingError("Could not infer the output dimensions of the language model.")

Expand Down
2 changes: 1 addition & 1 deletion haystack/modeling/model/prediction_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -732,7 +732,7 @@ def aggregate_preds(self, preds, passage_start_t, ids, seq_2_start_t=None, label
all_basket_labels = {k: self.reduce_labels(v) for k, v in all_basket_labels.items()}

# Return aggregated predictions in order as a list of lists
keys = [k for k in all_basket_preds]
keys = list(all_basket_preds)
aggregated_preds = [all_basket_preds[k] for k in keys]
if labels:
labels = [all_basket_labels[k] for k in keys]
Expand Down
2 changes: 1 addition & 1 deletion haystack/modeling/model/triadaptive_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ def forward(self, **kwargs):
output2 = None

embedding1, embedding2 = head(output1, output2)
all_logits.append(tuple([embedding1, embedding2]))
all_logits.append((embedding1, embedding2))
else:
# just return LM output (e.g. useful for extracting embeddings at inference time)
all_logits.append((pooled_output))
Expand Down
4 changes: 2 additions & 2 deletions haystack/nodes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def name(self, value: str):
@property
def utilized_components(self) -> List[BaseComponent]:
if "params" not in self._component_config:
return list()
return []
return [param for param in self._component_config["params"].values() if isinstance(param, BaseComponent)]

@property
Expand Down Expand Up @@ -229,7 +229,7 @@ def _dispatch_run_general(self, run_method: Callable, **kwargs):
if "debug" in value.keys():
self.debug = value.pop("debug")

for _k, _v in value.items():
for _k in value.keys():
if _k not in run_signature_args:
raise Exception(f"Invalid parameter '{_k}' for the node '{self.name}'.")

Expand Down
2 changes: 1 addition & 1 deletion haystack/nodes/document_classifier/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def predict(self, documents: List[Document], batch_size: Optional[int] = None) -
formatted_prediction = {
"label": prediction["labels"][0],
"score": prediction["scores"][0],
"details": {label: score for label, score in zip(prediction["labels"], prediction["scores"])},
"details": dict(zip(prediction["labels"], prediction["scores"])),
}
elif self.task == "text-classification":
formatted_prediction = {
Expand Down
2 changes: 1 addition & 1 deletion haystack/nodes/file_converter/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def _convert_tables_and_text(
if not isinstance(table.content, pd.DataFrame):
raise HaystackError("Document's content field must be of type 'pd.DataFrame'.")
for _, row in table.content.iterrows():
for _, cell in row.items():
for cell in row.values():
file_text += f" {cell}"
if not self.validate_language(file_text, valid_languages):
logger.warning(
Expand Down
5 changes: 2 additions & 3 deletions haystack/nodes/file_converter/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,15 +202,14 @@ def run( # type: ignore
for file_path, file_meta in tqdm(
zip(file_paths, meta), total=len(file_paths), disable=not self.progress_bar, desc="Converting files"
):
for doc in self.convert(
documents += self.convert(
file_path=file_path,
meta=file_meta,
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
encoding=encoding,
id_hash_keys=id_hash_keys,
):
documents.append(doc)
)

# Cleanup ligatures
for document in documents:
Expand Down
2 changes: 1 addition & 1 deletion haystack/nodes/file_converter/parsr.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def convert(
if not isinstance(table.content, pd.DataFrame):
raise HaystackError("Document's content field must be of type 'pd.DataFrame'.")
for _, row in table.content.iterrows():
for _, cell in row.items():
for cell in row.values():
file_text += f" {cell}"
if not self.validate_language(file_text, valid_languages):
logger.warning(
Expand Down
2 changes: 1 addition & 1 deletion haystack/nodes/file_converter/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def _read_pdf(
document += page.get_text("text", textpage=partial_tp, sort=sort_by_position) + "\f"
else:
cpu = cpu_count() if isinstance(multiprocessing, bool) else multiprocessing
page_list = [i for i in range(start_page, end_page)]
page_list = list(range(start_page, end_page))
cpu = cpu if len(page_list) > cpu else len(page_list)
parts = divide(cpu, page_list)
pages_mp = [(i, file_path, parts, sort_by_position, ocr, ocr_language) for i in range(cpu)]
Expand Down
2 changes: 1 addition & 1 deletion haystack/nodes/other/join_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def run_accumulated(self, inputs: List[dict], top_k_join: Optional[int] = None):
"score would be `-infinity`."
)
else:
sorted_docs = [(k, v) for k, v in scores_map.items()]
sorted_docs = list(scores_map.items())

if not top_k_join:
top_k_join = self.top_k_join
Expand Down
2 changes: 1 addition & 1 deletion haystack/nodes/query_classifier/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def __init__(

self.labels = labels
if task == "text-classification":
labels_from_model = [label for label in self.model.model.config.id2label.values()]
labels_from_model = list(self.model.model.config.id2label.values())
if set(labels) != set(labels_from_model):
raise ValueError(
f"For text-classification, the provided labels must match the model labels; only the order can differ.\n"
Expand Down
2 changes: 1 addition & 1 deletion haystack/nodes/ranker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def _add_meta_fields_to_docs(
for key in embed_meta_fields:
if key in doc.meta and doc.meta[key]:
if isinstance(doc.meta[key], list):
meta_data_fields.extend([item for item in doc.meta[key]])
meta_data_fields.extend(list(doc.meta[key]))
else:
meta_data_fields.append(doc.meta[key])
# Convert to type string (e.g. for ints or floats)
Expand Down
4 changes: 2 additions & 2 deletions haystack/nodes/reader/farm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1153,10 +1153,10 @@ def eval(
}

# Get rid of the question key again (after we aggregated we don't need it anymore)
d[str(doc_id)]["qas"] = [v for v in aggregated_per_question.values()]
d[str(doc_id)]["qas"] = list(aggregated_per_question.values())

# Convert input format for FARM
farm_input = [v for v in d.values()]
farm_input = list(d.values())
n_queries = len([y for x in farm_input for y in x["qas"]])

# Create DataLoader that can be passed to the Evaluator
Expand Down
2 changes: 1 addition & 1 deletion haystack/nodes/retriever/_embedding_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ def embed(self, model: str, text: List[str]) -> np.ndarray:
raise CohereUnauthorizedError(f"Invalid Cohere API key. {response.text}")
if response.status_code != 200:
raise CohereError(response.text, status_code=response.status_code)
generated_embeddings = [e for e in res["embeddings"]]
generated_embeddings = list(res["embeddings"])
return np.array(generated_embeddings)

def embed_batch(self, text: List[str]) -> np.ndarray:
Expand Down
6 changes: 3 additions & 3 deletions haystack/nodes/retriever/dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ def _get_predictions(self, dicts: List[Dict[str, Any]]) -> Dict[str, np.ndarray]
:return: dictionary of embeddings for "passages" and "query"
"""
dataset, tensor_names, _, _ = self.processor.dataset_from_dicts(
dicts, indices=[i for i in range(len(dicts))], return_baskets=True
dicts, indices=list(range(len(dicts))), return_baskets=True
)

data_loader = NamedDataLoader(
Expand Down Expand Up @@ -1113,7 +1113,7 @@ def _get_predictions(self, dicts: List[Dict[str, Any]]) -> Dict[str, np.ndarray]
"""

dataset, tensor_names, _, _ = self.processor.dataset_from_dicts(
dicts, indices=[i for i in range(len(dicts))], return_baskets=True
dicts, indices=list(range(len(dicts))), return_baskets=True
)

data_loader = NamedDataLoader(
Expand Down Expand Up @@ -1862,7 +1862,7 @@ def _preprocess_documents(self, docs: List[Document]) -> List[Document]:
for key in self.embed_meta_fields:
if key in doc.meta and doc.meta[key]:
if isinstance(doc.meta[key], list):
meta_data_fields.extend([item for item in doc.meta[key]])
meta_data_fields.extend(list(doc.meta[key]))
else:
meta_data_fields.append(doc.meta[key])
# Convert to type string (e.g. for ints or floats)
Expand Down
5 changes: 1 addition & 4 deletions haystack/nodes/retriever/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,10 +457,7 @@ def _get_all_paragraphs(self, document_store: BaseDocumentStore, index: Optional
def _calc_scores(self, queries: List[str], index: str) -> List[Dict[int, float]]:
question_vector = self.vectorizer.transform(queries)
doc_scores_per_query = self.tfidf_matrices[index].dot(question_vector.T).T.toarray()
doc_scores_per_query = [
[(doc_idx, doc_score) for doc_idx, doc_score in enumerate(doc_scores)]
for doc_scores in doc_scores_per_query
]
doc_scores_per_query = [list(enumerate(doc_scores)) for doc_scores in doc_scores_per_query]
indices_and_scores: List[Dict] = [
OrderedDict(sorted(query_idx_scores, key=lambda tup: tup[1], reverse=True))
for query_idx_scores in doc_scores_per_query
Expand Down
2 changes: 1 addition & 1 deletion haystack/nodes/translator/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def translate(
return translated_texts[0]
elif documents:
if isinstance(documents, list) and isinstance(documents[0], str):
return [translated_text for translated_text in translated_texts]
return list(translated_texts)

translated_documents: Union[
List[Document], List[Answer], List[str], List[Dict[str, Any]]
Expand Down
8 changes: 4 additions & 4 deletions haystack/pipelines/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def save_to_deepset_cloud(
index_config = index_pipeline.get_config()
pipelines = query_config["pipelines"] + index_config["pipelines"]
all_components = query_config["components"] + index_config["components"]
distinct_components = [c for c in {component["name"]: component for component in all_components}.values()]
distinct_components = list({component["name"]: component for component in all_components}.values())
document_stores = [c for c in distinct_components if c["type"].endswith("DocumentStore")]
for document_store in document_stores:
if document_store["type"] != "DeepsetCloudDocumentStore":
Expand Down Expand Up @@ -827,10 +827,10 @@ def eval_beir(
logger.info("Cropping dataset from %s to %s documents", len(corpus), num_documents)
corpus = dict(itertools.islice(corpus.items(), num_documents))
# Remove queries that don't contain the remaining documents
corpus_ids = set(list(corpus.keys()))
corpus_ids = set(corpus.keys())
qrels_new = {}
for query_id, document_rel_dict in qrels.items():
document_rel_ids_intersection = list(corpus_ids & set(list(document_rel_dict.keys())))
document_rel_ids_intersection = list(corpus_ids & set(document_rel_dict.keys()))
# If there are no remaining documents related to the query, delete the query
if len(document_rel_ids_intersection) == 0:
del queries[query_id]
Expand Down Expand Up @@ -1957,7 +1957,7 @@ def get_document_store(self) -> Optional[BaseDocumentStore]:
matches = self.get_nodes_by_class(class_type=BaseDocumentStore)
if len(matches) == 0:
matches = list(
set(retriever.document_store for retriever in self.get_nodes_by_class(class_type=BaseRetriever))
{retriever.document_store for retriever in self.get_nodes_by_class(class_type=BaseRetriever)}
)

if len(matches) > 1:
Expand Down
Loading