From 4d1aa39522b96a6dfe5b59d597e8b376f40dc7e6 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Wed, 13 Sep 2023 19:51:27 +0200 Subject: [PATCH 1/6] Python performance improvements with ruff C4 and PERF --- e2e/modeling/test_dpr.py | 10 +++++----- examples/basic_qa_pipeline.py | 15 ++++++++------- haystack/document_stores/base.py | 2 +- haystack/document_stores/opensearch.py | 6 +++--- haystack/modeling/data_handler/dataset.py | 2 +- haystack/modeling/data_handler/processor.py | 14 +++++++------- haystack/modeling/model/biadaptive_model.py | 2 +- haystack/modeling/model/language_model.py | 8 ++++---- haystack/modeling/model/prediction_head.py | 2 +- haystack/modeling/model/triadaptive_model.py | 2 +- haystack/nodes/base.py | 4 ++-- .../nodes/document_classifier/transformers.py | 2 +- haystack/nodes/file_converter/azure.py | 2 +- haystack/nodes/file_converter/base.py | 5 ++--- haystack/nodes/file_converter/parsr.py | 2 +- haystack/nodes/file_converter/pdf.py | 2 +- haystack/nodes/query_classifier/transformers.py | 2 +- haystack/nodes/ranker/base.py | 2 +- haystack/nodes/reader/farm.py | 4 ++-- haystack/nodes/retriever/_embedding_encoder.py | 2 +- haystack/nodes/retriever/dense.py | 6 +++--- haystack/nodes/translator/transformers.py | 2 +- haystack/pipelines/base.py | 8 ++++---- haystack/pipelines/config.py | 2 +- haystack/preview/testing/document_store.py | 7 +++---- haystack/schema.py | 2 +- haystack/testing/document_store.py | 4 ++-- haystack/utils/context_matching.py | 6 +++--- haystack/utils/deepsetcloud.py | 6 +++--- haystack/utils/preprocessing.py | 4 ++-- pyproject.toml | 5 ++++- test/benchmarks/datadog/metric_handler.py | 2 +- test/benchmarks/datadog/send_metrics.py | 2 +- test/benchmarks/utils.py | 4 ++-- test/document_stores/test_elasticsearch.py | 4 ++-- test/nodes/test_file_converter.py | 2 +- test/nodes/test_preprocessor.py | 2 +- test/nodes/test_reader.py | 2 +- test/nodes/test_shaper.py | 4 ++-- test/nodes/test_web_search.py | 2 +- test/others/test_utils.py | 2 +- test/pipelines/test_eval.py | 12 ++++++------ test/prompt/test_prompt_template.py | 8 ++++---- 43 files changed, 95 insertions(+), 93 deletions(-) diff --git a/e2e/modeling/test_dpr.py b/e2e/modeling/test_dpr.py index 1cee3ac87b..57016f2fe7 100644 --- a/e2e/modeling/test_dpr.py +++ b/e2e/modeling/test_dpr.py @@ -777,7 +777,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa # generate embeddings with model loaded from model hub dataset, tensor_names, _, __ = processor.dataset_from_dicts( - dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True + dicts=[d], indices=list(range(len([d]))), return_baskets=True ) data_loader = NamedDataLoader( @@ -811,7 +811,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa # generate embeddings with model loaded from disk dataset2, tensor_names2, _, __ = loaded_processor.dataset_from_dicts( - dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True + dicts=[d], indices=list(range(len([d]))), return_baskets=True ) data_loader = NamedDataLoader( @@ -820,7 +820,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa all_embeddings2: Dict[str, Any] = {"query": [], "passages": []} loaded_model.eval() - for i, batch in enumerate(tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True)): + for batch in tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True): batch = {key: batch[key].to(device) for key in batch} # get logits 
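The two hunks above, and many below, are instances of ruff's flake8-comprehensions (C4) family: a comprehension that merely re-emits each element of an iterable is a plain copy, and the direct constructor call does the same work in one C-level pass instead of a per-element bytecode loop. A minimal, self-contained sketch of the three rewrite shapes this series applies most often — the data and timing counts are illustrative only, and absolute numbers will vary by machine and interpreter:

    import timeit

    n = 10_000
    pairs = list(zip(range(n), range(n)))

    # C416: a comprehension that only re-emits each element is a copy;
    # the constructor form does the same in a single C-level call.
    slow_list = timeit.timeit(lambda: [i for i in range(n)], number=200)
    fast_list = timeit.timeit(lambda: list(range(n)), number=200)

    # C404/C416: dict(pairs) instead of {k: v for k, v in pairs}.
    slow_dict = timeit.timeit(lambda: {k: v for k, v in pairs}, number=200)
    fast_dict = timeit.timeit(lambda: dict(pairs), number=200)

    # C401: a set comprehension instead of set(generator expression).
    slow_set = timeit.timeit(lambda: set(i % 97 for i in range(n)), number=200)
    fast_set = timeit.timeit(lambda: {i % 97 for i in range(n)}, number=200)

    print(f"list copy: {slow_list:.3f}s -> {fast_list:.3f}s")
    print(f"dict copy: {slow_dict:.3f}s -> {fast_dict:.3f}s")
    print(f"set build: {slow_set:.3f}s -> {fast_set:.3f}s")

On CPython the constructor forms are typically noticeably faster for plain copies, but the larger win is readability: list(range(...)) states the intent directly.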
@@ -904,7 +904,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa # generate embeddings with model loaded from disk that originated from a FARM style model that was saved to disk earlier dataset3, tensor_names3, _, __ = loaded_processor.dataset_from_dicts( - dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True + dicts=[d], indices=list(range(len([d]))), return_baskets=True ) data_loader = NamedDataLoader( @@ -913,7 +913,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa all_embeddings3: Dict[str, Any] = {"query": [], "passages": []} loaded_model.eval() - for i, batch in enumerate(tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True)): + for batch in tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True): batch = {key: batch[key].to(device) for key in batch} # get logits diff --git a/examples/basic_qa_pipeline.py b/examples/basic_qa_pipeline.py index 9e90114d0e..b58e0ccfe9 100644 --- a/examples/basic_qa_pipeline.py +++ b/examples/basic_qa_pipeline.py @@ -1,16 +1,17 @@ import logging from pathlib import Path -logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) -logging.getLogger("haystack").setLevel(logging.INFO) - from haystack.document_stores import ElasticsearchDocumentStore -from haystack.utils import fetch_archive_from_http, print_answers, launch_es -from haystack.nodes import FARMReader, BM25Retriever +from haystack.nodes import BM25Retriever, FARMReader from haystack.nodes.file_classifier import FileTypeClassifier -from haystack.nodes.preprocessor import PreProcessor from haystack.nodes.file_converter import TextConverter +from haystack.nodes.preprocessor import PreProcessor from haystack.pipelines import Pipeline +from haystack.utils import fetch_archive_from_http, launch_es, print_answers + +# pylint: disable=no-logging-basicconfig +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) def basic_qa_pipeline(): @@ -22,7 +23,7 @@ def basic_qa_pipeline(): s3_url = "https://core-engineering.s3.eu-central-1.amazonaws.com/public/scripts/wiki_gameofthrones_txt1.zip" fetch_archive_from_http(url=s3_url, output_dir=doc_dir) - file_paths = [p for p in Path(doc_dir).glob("**/*")] + file_paths = list(Path(doc_dir).glob("**/*")) files_metadata = [{"name": path.name} for path in file_paths] # Indexing Pipeline diff --git a/haystack/document_stores/base.py b/haystack/document_stores/base.py index a943dc35b1..514b6eae0d 100644 --- a/haystack/document_stores/base.py +++ b/haystack/document_stores/base.py @@ -595,7 +595,7 @@ def _drop_duplicate_documents(self, documents: List[Document], index: Optional[s :param index: name of the index :return: A list of Haystack Document objects. 
""" - _hash_ids: Set = set([]) + _hash_ids: Set = set() _documents: List[Document] = [] for document in documents: diff --git a/haystack/document_stores/opensearch.py b/haystack/document_stores/opensearch.py index 0b05bbb8be..4b41a9d741 100644 --- a/haystack/document_stores/opensearch.py +++ b/haystack/document_stores/opensearch.py @@ -1215,11 +1215,11 @@ def _get_embedding_field_mapping( def _ivf_model_exists(self, index: str) -> bool: if self._index_exists(".opensearch-knn-models"): response = self.client.transport.perform_request("GET", "/_plugins/_knn/models/_search") - existing_ivf_models = set( + existing_ivf_models = { model["_source"]["model_id"] for model in response["hits"]["hits"] if model["_source"]["state"] != "failed" - ) + } else: existing_ivf_models = set() @@ -1461,7 +1461,7 @@ def _delete_ivf_model(self, index: str): """ if self._index_exists(".opensearch-knn-models"): response = self.client.transport.perform_request("GET", "/_plugins/_knn/models/_search") - existing_ivf_models = set(model["_source"]["model_id"] for model in response["hits"]["hits"]) + existing_ivf_models = {model["_source"]["model_id"] for model in response["hits"]["hits"]} if f"{index}-ivf" in existing_ivf_models: self.client.transport.perform_request("DELETE", f"/_plugins/_knn/models/{index}-ivf") diff --git a/haystack/modeling/data_handler/dataset.py b/haystack/modeling/data_handler/dataset.py index 6c073a96b7..3a8df06283 100644 --- a/haystack/modeling/data_handler/dataset.py +++ b/haystack/modeling/data_handler/dataset.py @@ -27,7 +27,7 @@ def flatten_rename( assert any(key in encoded_batch for key in keys), f"one of the keys {keys} is not in batch {encoded_batch.keys()}" features_flat = [] for item in range(len(encoded_batch[keys[0]])): - feat_dict = {k: v for k, v in zip(renamed_keys, [encoded_batch[k][item] for k in keys])} + feat_dict = dict(zip(renamed_keys, [encoded_batch[k][item] for k in keys])) features_flat.append(feat_dict) return features_flat diff --git a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py index 6d5cda47fb..3556419496 100644 --- a/haystack/modeling/data_handler/processor.py +++ b/haystack/modeling/data_handler/processor.py @@ -174,7 +174,7 @@ def load_from_dir(cls, load_dir: str): config = json.load(f) config["inference"] = True # init tokenizer - if "lower_case" in config.keys(): + if "lower_case" in config: logger.warning( "Loading tokenizer from deprecated config. " "If you used `custom_vocab` or `never_split_chars`, this won't work anymore." @@ -1249,7 +1249,7 @@ def _combine_title_context(titles: List[str], texts: List[str]): "Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. 
Related passage text: '%s' ", ctx, ) - res.append(tuple((title, ctx))) + res.append((title, ctx)) return res @@ -1762,7 +1762,7 @@ def _combine_meta_context(meta_fields: List[str], texts: List[str]): for meta, ctx in zip(meta_fields, texts): if meta is None: meta = "" - res.append(tuple((meta, ctx))) + res.append((meta, ctx)) return res @@ -2111,12 +2111,12 @@ def dataset_from_dicts( truncation=True, max_length=self.max_seq_len, ) - names = [key for key in tokens] + names = list(tokens) inputs = [tokens[key] for key in tokens] - if not "padding_mask" in names: + if "padding_mask" not in names: index = names.index("attention_mask") names[index] = "padding_mask" - if not "segment_ids" in names: + if "segment_ids" not in names: index = names.index("token_type_ids") names[index] = "segment_ids" @@ -2149,7 +2149,7 @@ def write_squad_predictions(predictions, out_filename, predictions_filename=None dev_labels[q["id"]] = "is_impossible" else: dev_labels[q["id"]] = q["answers"][0]["text"] - not_included = set(list(dev_labels.keys())) - set(list(predictions_json.keys())) + not_included = dev_labels.keys() - predictions_json.keys() if len(not_included) > 0: logger.info("There were missing predictions for question ids: %s", list(not_included)) for x in not_included: diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index 0ebcf072e0..3e717484d0 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -322,7 +322,7 @@ def forward( output2 = None embedding1, embedding2 = head(output1, output2) - all_logits.append(tuple([embedding1, embedding2])) + all_logits.append((embedding1, embedding2)) else: # just return LM output (e.g. useful for extracting embeddings at inference time) all_logits.append((pooled_output)) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index eaa83febd7..63582419b5 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -108,14 +108,14 @@ def output_dims(self): if self._output_dims: return self._output_dims - for odn in OUTPUT_DIM_NAMES: - try: + try: + for odn in OUTPUT_DIM_NAMES: value = getattr(self.model.config, odn, None) if value: self._output_dims = value return value - except AttributeError: - raise ModelingError("Can't get the output dimension before loading the model.") + except AttributeError: + raise ModelingError("Can't get the output dimension before loading the model.") raise ModelingError("Could not infer the output dimensions of the language model.") diff --git a/haystack/modeling/model/prediction_head.py b/haystack/modeling/model/prediction_head.py index 351e4045d0..aefb3eade2 100644 --- a/haystack/modeling/model/prediction_head.py +++ b/haystack/modeling/model/prediction_head.py @@ -732,7 +732,7 @@ def aggregate_preds(self, preds, passage_start_t, ids, seq_2_start_t=None, label all_basket_labels = {k: self.reduce_labels(v) for k, v in all_basket_labels.items()} # Return aggregated predictions in order as a list of lists - keys = [k for k in all_basket_preds] + keys = list(all_basket_preds) aggregated_preds = [all_basket_preds[k] for k in keys] if labels: labels = [all_basket_labels[k] for k in keys] diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py index 4e92b8ed90..5d831eee1a 100644 --- a/haystack/modeling/model/triadaptive_model.py +++ b/haystack/modeling/model/triadaptive_model.py @@ -283,7 +283,7 @@ def 
forward(self, **kwargs): output2 = None embedding1, embedding2 = head(output1, output2) - all_logits.append(tuple([embedding1, embedding2])) + all_logits.append((embedding1, embedding2)) else: # just return LM output (e.g. useful for extracting embeddings at inference time) all_logits.append((pooled_output)) diff --git a/haystack/nodes/base.py b/haystack/nodes/base.py index 7ba07174dc..a471b8c3f5 100644 --- a/haystack/nodes/base.py +++ b/haystack/nodes/base.py @@ -96,7 +96,7 @@ def name(self, value: str): @property def utilized_components(self) -> List[BaseComponent]: if "params" not in self._component_config: - return list() + return [] return [param for param in self._component_config["params"].values() if isinstance(param, BaseComponent)] @property @@ -229,7 +229,7 @@ def _dispatch_run_general(self, run_method: Callable, **kwargs): if "debug" in value.keys(): self.debug = value.pop("debug") - for _k, _v in value.items(): + for _k in value.keys(): if _k not in run_signature_args: raise Exception(f"Invalid parameter '{_k}' for the node '{self.name}'.") diff --git a/haystack/nodes/document_classifier/transformers.py b/haystack/nodes/document_classifier/transformers.py index 02ba6356fc..f5c2a84f6b 100644 --- a/haystack/nodes/document_classifier/transformers.py +++ b/haystack/nodes/document_classifier/transformers.py @@ -202,7 +202,7 @@ def predict(self, documents: List[Document], batch_size: Optional[int] = None) - formatted_prediction = { "label": prediction["labels"][0], "score": prediction["scores"][0], - "details": {label: score for label, score in zip(prediction["labels"], prediction["scores"])}, + "details": dict(zip(prediction["labels"], prediction["scores"])), } elif self.task == "text-classification": formatted_prediction = { diff --git a/haystack/nodes/file_converter/azure.py b/haystack/nodes/file_converter/azure.py index 8ddbcd0495..fca5811346 100644 --- a/haystack/nodes/file_converter/azure.py +++ b/haystack/nodes/file_converter/azure.py @@ -203,7 +203,7 @@ def _convert_tables_and_text( if not isinstance(table.content, pd.DataFrame): raise HaystackError("Document's content field must be of type 'pd.DataFrame'.") for _, row in table.content.iterrows(): - for _, cell in row.items(): + for cell in row.values(): file_text += f" {cell}" if not self.validate_language(file_text, valid_languages): logger.warning( diff --git a/haystack/nodes/file_converter/base.py b/haystack/nodes/file_converter/base.py index 7bd24e04aa..d5fbb3fc58 100644 --- a/haystack/nodes/file_converter/base.py +++ b/haystack/nodes/file_converter/base.py @@ -202,15 +202,14 @@ def run( # type: ignore for file_path, file_meta in tqdm( zip(file_paths, meta), total=len(file_paths), disable=not self.progress_bar, desc="Converting files" ): - for doc in self.convert( + documents += self.convert( file_path=file_path, meta=file_meta, remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, encoding=encoding, id_hash_keys=id_hash_keys, - ): - documents.append(doc) + ) # Cleanup ligatures for document in documents: diff --git a/haystack/nodes/file_converter/parsr.py b/haystack/nodes/file_converter/parsr.py index dceca036f7..55217c2eb7 100644 --- a/haystack/nodes/file_converter/parsr.py +++ b/haystack/nodes/file_converter/parsr.py @@ -199,7 +199,7 @@ def convert( if not isinstance(table.content, pd.DataFrame): raise HaystackError("Document's content field must be of type 'pd.DataFrame'.") for _, row in table.content.iterrows(): - for _, cell in row.items(): + for cell in row.values(): file_text += f" 
{cell}" if not self.validate_language(file_text, valid_languages): logger.warning( diff --git a/haystack/nodes/file_converter/pdf.py b/haystack/nodes/file_converter/pdf.py index 5915541315..7ce6e6ccdd 100644 --- a/haystack/nodes/file_converter/pdf.py +++ b/haystack/nodes/file_converter/pdf.py @@ -286,7 +286,7 @@ def _read_pdf( document += page.get_text("text", textpage=partial_tp, sort=sort_by_position) + "\f" else: cpu = cpu_count() if isinstance(multiprocessing, bool) else multiprocessing - page_list = [i for i in range(start_page, end_page)] + page_list = list(range(start_page, end_page)) cpu = cpu if len(page_list) > cpu else len(page_list) parts = divide(cpu, page_list) pages_mp = [(i, file_path, parts, sort_by_position, ocr, ocr_language) for i in range(cpu)] diff --git a/haystack/nodes/query_classifier/transformers.py b/haystack/nodes/query_classifier/transformers.py index 7fb926e041..1369589608 100644 --- a/haystack/nodes/query_classifier/transformers.py +++ b/haystack/nodes/query_classifier/transformers.py @@ -126,7 +126,7 @@ def __init__( self.labels = labels if task == "text-classification": - labels_from_model = [label for label in self.model.model.config.id2label.values()] + labels_from_model = list(self.model.model.config.id2label.values()) if set(labels) != set(labels_from_model): raise ValueError( f"For text-classification, the provided labels must match the model labels; only the order can differ.\n" diff --git a/haystack/nodes/ranker/base.py b/haystack/nodes/ranker/base.py index 034936e40b..186cc88470 100644 --- a/haystack/nodes/ranker/base.py +++ b/haystack/nodes/ranker/base.py @@ -55,7 +55,7 @@ def _add_meta_fields_to_docs( for key in embed_meta_fields: if key in doc.meta and doc.meta[key]: if isinstance(doc.meta[key], list): - meta_data_fields.extend([item for item in doc.meta[key]]) + meta_data_fields.extend(list(doc.meta[key])) else: meta_data_fields.append(doc.meta[key]) # Convert to type string (e.g. for ints or floats) diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py index 5302051928..4e39d3d65e 100644 --- a/haystack/nodes/reader/farm.py +++ b/haystack/nodes/reader/farm.py @@ -1153,10 +1153,10 @@ def eval( } # Get rid of the question key again (after we aggregated we don't need it anymore) - d[str(doc_id)]["qas"] = [v for v in aggregated_per_question.values()] + d[str(doc_id)]["qas"] = list(aggregated_per_question.values()) # Convert input format for FARM - farm_input = [v for v in d.values()] + farm_input = list(d.values()) n_queries = len([y for x in farm_input for y in x["qas"]]) # Create DataLoader that can be passed to the Evaluator diff --git a/haystack/nodes/retriever/_embedding_encoder.py b/haystack/nodes/retriever/_embedding_encoder.py index cab092df3c..571a47499b 100644 --- a/haystack/nodes/retriever/_embedding_encoder.py +++ b/haystack/nodes/retriever/_embedding_encoder.py @@ -394,7 +394,7 @@ def embed(self, model: str, text: List[str]) -> np.ndarray: raise CohereUnauthorizedError(f"Invalid Cohere API key. 
{response.text}") if response.status_code != 200: raise CohereError(response.text, status_code=response.status_code) - generated_embeddings = [e for e in res["embeddings"]] + generated_embeddings = list(res["embeddings"]) return np.array(generated_embeddings) def embed_batch(self, text: List[str]) -> np.ndarray: diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index eea47327e3..eddff140ce 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -484,7 +484,7 @@ def _get_predictions(self, dicts: List[Dict[str, Any]]) -> Dict[str, np.ndarray] :return: dictionary of embeddings for "passages" and "query" """ dataset, tensor_names, _, _ = self.processor.dataset_from_dicts( - dicts, indices=[i for i in range(len(dicts))], return_baskets=True + dicts, indices=list(range(len(dicts))), return_baskets=True ) data_loader = NamedDataLoader( @@ -1113,7 +1113,7 @@ def _get_predictions(self, dicts: List[Dict[str, Any]]) -> Dict[str, np.ndarray] """ dataset, tensor_names, _, _ = self.processor.dataset_from_dicts( - dicts, indices=[i for i in range(len(dicts))], return_baskets=True + dicts, indices=list(range(len(dicts))), return_baskets=True ) data_loader = NamedDataLoader( @@ -1862,7 +1862,7 @@ def _preprocess_documents(self, docs: List[Document]) -> List[Document]: for key in self.embed_meta_fields: if key in doc.meta and doc.meta[key]: if isinstance(doc.meta[key], list): - meta_data_fields.extend([item for item in doc.meta[key]]) + meta_data_fields.extend(list(doc.meta[key])) else: meta_data_fields.append(doc.meta[key]) # Convert to type string (e.g. for ints or floats) diff --git a/haystack/nodes/translator/transformers.py b/haystack/nodes/translator/transformers.py index 8cc3384373..69e98ee631 100644 --- a/haystack/nodes/translator/transformers.py +++ b/haystack/nodes/translator/transformers.py @@ -168,7 +168,7 @@ def translate( return translated_texts[0] elif documents: if isinstance(documents, list) and isinstance(documents[0], str): - return [translated_text for translated_text in translated_texts] + return list(translated_texts) translated_documents: Union[ List[Document], List[Answer], List[str], List[Dict[str, Any]] diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index ea81f3e554..5d51528c8c 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -284,7 +284,7 @@ def save_to_deepset_cloud( index_config = index_pipeline.get_config() pipelines = query_config["pipelines"] + index_config["pipelines"] all_components = query_config["components"] + index_config["components"] - distinct_components = [c for c in {component["name"]: component for component in all_components}.values()] + distinct_components = list({component["name"]: component for component in all_components}.values()) document_stores = [c for c in distinct_components if c["type"].endswith("DocumentStore")] for document_store in document_stores: if document_store["type"] != "DeepsetCloudDocumentStore": @@ -827,10 +827,10 @@ def eval_beir( logger.info("Cropping dataset from %s to %s documents", len(corpus), num_documents) corpus = dict(itertools.islice(corpus.items(), num_documents)) # Remove queries that don't contain the remaining documents - corpus_ids = set(list(corpus.keys())) + corpus_ids = set(corpus.keys()) qrels_new = {} for query_id, document_rel_dict in qrels.items(): - document_rel_ids_intersection = list(corpus_ids & set(list(document_rel_dict.keys()))) + document_rel_ids_intersection = list(corpus_ids & 
set(document_rel_dict.keys())) # If there are no remaining documents related to the query, delete the query if len(document_rel_ids_intersection) == 0: del queries[query_id] @@ -1957,7 +1957,7 @@ def get_document_store(self) -> Optional[BaseDocumentStore]: matches = self.get_nodes_by_class(class_type=BaseDocumentStore) if len(matches) == 0: matches = list( - set(retriever.document_store for retriever in self.get_nodes_by_class(class_type=BaseRetriever)) + {retriever.document_store for retriever in self.get_nodes_by_class(class_type=BaseRetriever)} ) if len(matches) > 1: diff --git a/haystack/pipelines/config.py b/haystack/pipelines/config.py index 114330584d..10fef0f255 100644 --- a/haystack/pipelines/config.py +++ b/haystack/pipelines/config.py @@ -119,7 +119,7 @@ def build_component_dependency_graph( graph = nx.DiGraph() for component_name, component_definition in component_definitions.items(): params = component_definition.get("params", {}) - referenced_components: List[str] = list() + referenced_components: List[str] = [] for param_value in params.values(): # Currently we don't do any additional type validation here. # See https://github.com/deepset-ai/haystack/pull/2253#discussion_r815951591. diff --git a/haystack/preview/testing/document_store.py b/haystack/preview/testing/document_store.py index 9127fdab1c..e4ae2b6bb3 100644 --- a/haystack/preview/testing/document_store.py +++ b/haystack/preview/testing/document_store.py @@ -316,14 +316,13 @@ def test_nin_filter_embedding(self, docstore: DocumentStore, filterable_docs: Li result = docstore.filter_documents(filters={"embedding": {"$nin": [embedding_ones, embedding_zeros]}}) assert self.contains_same_docs( result, - [ - doc - for doc in filterable_docs + list( + filterable_docs or ( not np.array_equal(embedding_zeros, doc.embedding) # type: ignore and not np.array_equal(embedding_ones, doc.embedding) # type: ignore ) - ], + ), ) @pytest.mark.unit diff --git a/haystack/schema.py b/haystack/schema.py index a6547e5460..c4f7cbd1c2 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -1486,7 +1486,7 @@ def find_matched_label_idxs(row) -> List[int]: # pylint: disable=too-many-retur gold_document_ids = [id for id in gold_document_ids if id != "00"] num_labels = len(gold_document_ids) - num_matched_labels = len(set(idx for idxs in relevant_rows["matched_label_idxs"] for idx in idxs)) + num_matched_labels = len({idx for idxs in relevant_rows["matched_label_idxs"] for idx in idxs}) num_missing_labels = num_labels - num_matched_labels relevance_criterion_ids = list(relevant_rows["document_id"].values) diff --git a/haystack/testing/document_store.py b/haystack/testing/document_store.py index cba0d5950e..9e2f2ad928 100644 --- a/haystack/testing/document_store.py +++ b/haystack/testing/document_store.py @@ -73,8 +73,8 @@ def test_write_documents(self, ds, documents): ds.write_documents(documents) docs = ds.get_all_documents() assert len(docs) == len(documents) - expected_ids = set(doc.id for doc in documents) - ids = set(doc.id for doc in docs) + expected_ids = {doc.id for doc in documents} + ids = {doc.id for doc in docs} assert ids == expected_ids @pytest.mark.integration diff --git a/haystack/utils/context_matching.py b/haystack/utils/context_matching.py index f18980204f..80cfb1bd0b 100644 --- a/haystack/utils/context_matching.py +++ b/haystack/utils/context_matching.py @@ -147,7 +147,7 @@ def match_context( matches = (candidate for candidate in candidate_scores if candidate.score > threshold) sorted_matches = sorted(matches, key=lambda 
candidate: candidate.score, reverse=True) - match_list = list((candidate_score.candidate_id, candidate_score.score) for candidate_score in sorted_matches) + match_list = [(candidate_score.candidate_id, candidate_score.score) for candidate_score in sorted_matches] return match_list @@ -208,13 +208,13 @@ def match_contexts( if show_progress: candidate_scores = tqdm(candidate_scores) - match_lists: List[List[Tuple[str, float]]] = list() + match_lists: List[List[Tuple[str, float]]] = [] matches = (candidate for candidate in candidate_scores if candidate.score > threshold) group_sorted_matches = sorted(matches, key=lambda candidate: candidate.context_id) grouped_matches = groupby(group_sorted_matches, key=lambda candidate: candidate.context_id) for context_id, group in grouped_matches: sorted_group = sorted(group, key=lambda candidate: candidate.score, reverse=True) - match_list = list((candiate_score.candidate_id, candiate_score.score) for candiate_score in sorted_group) + match_list = [(candiate_score.candidate_id, candiate_score.score) for candiate_score in sorted_group] match_lists.insert(context_id, match_list) return match_lists diff --git a/haystack/utils/deepsetcloud.py b/haystack/utils/deepsetcloud.py index b215a22f12..24ab50b3c4 100644 --- a/haystack/utils/deepsetcloud.py +++ b/haystack/utils/deepsetcloud.py @@ -902,7 +902,7 @@ def get_evaluation_sets(self, workspace: Optional[str] = None) -> List[dict]: """ evaluation_sets_response = self._get_evaluation_sets(workspace=workspace) - return [eval_set for eval_set in evaluation_sets_response] + return list(evaluation_sets_response) def _get_evaluation_sets(self, workspace: Optional[str] = None) -> Generator: url = self._build_workspace_url(workspace=workspace) @@ -1166,7 +1166,7 @@ def get_eval_runs(self, workspace: Optional[str] = None, headers: Optional[dict] workspace_url = self._build_workspace_url(workspace) eval_run_url = f"{workspace_url}/eval_runs" response = self.client.get_with_auto_paging(eval_run_url, headers=headers) - return [eval_run for eval_run in response] + return list(response) def delete_eval_run(self, eval_run_name: str, workspace: Optional[str] = None, headers: Optional[dict] = None): """ @@ -1279,7 +1279,7 @@ def get_eval_run_predictions( workspace_url = self._build_workspace_url(workspace) eval_run_prediction_url = f"{workspace_url}/eval_runs/{eval_run_name}/nodes/{node_name}/predictions" response = self.client.get_with_auto_paging(eval_run_prediction_url, headers=headers) - return [prediction for prediction in response] + return list(response) def _build_workspace_url(self, workspace: Optional[str] = None): if workspace is None: diff --git a/haystack/utils/preprocessing.py b/haystack/utils/preprocessing.py index de80665dad..55b2cce9a3 100644 --- a/haystack/utils/preprocessing.py +++ b/haystack/utils/preprocessing.py @@ -34,7 +34,7 @@ def convert_files_to_docs( # Importing top-level causes a circular import from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter - file_paths = [p for p in Path(dir_path).glob("**/*")] + file_paths = list(Path(dir_path).glob("**/*")) allowed_suffixes = [".pdf", ".txt", ".docx"] suffix2converter: Dict[str, BaseConverter] = {} @@ -115,7 +115,7 @@ def tika_convert_files_to_docs( logger.error("Tika not installed. Please install tika and try again. 
Error: %s", ex) raise ex converter = TikaConverter() - paths = [p for p in Path(dir_path).glob("**/*")] + paths = list(Path(dir_path).glob("**/*")) allowed_suffixes = [".pdf", ".txt"] file_paths: List[Path] = [] diff --git a/pyproject.toml b/pyproject.toml index 3f5497f49c..c6a1373324 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -381,6 +381,7 @@ plugins = [ select = [ "AIR", # Airflow "ASYNC", # flake8-async + "C4", # flake8-comprehensions "C90", # McCabe cyclomatic complexity "CPY", # flake8-copyright "DJ", # flake8-django @@ -389,6 +390,7 @@ select = [ "F", # Pyflakes "FURB", # refurb "INT", # flake8-gettext + "PERF", # Perflint "PL", # Pylint "Q", # flake8-quotes "SLOT", # flake8-slots @@ -398,7 +400,6 @@ select = [ # "E", # pycodestyle # "NPY", # NumPy-specific rules # "PD", # pandas-vet - # "PERF", # Perflint # "PT", # flake8-pytest-style # "UP", # pyupgrade ] @@ -406,6 +407,8 @@ line-length = 1486 target-version = "py38" ignore = [ "F401", # unused-import + "PERF401", # Use a list comprehension to create a transformed list + "PERF203", # `try`-`except` within a loop incurs performance overhead "PLR1714", # repeated-equality-comparison "PLR5501", # collapsible-else-if "PLW0603", # global-statement diff --git a/test/benchmarks/datadog/metric_handler.py b/test/benchmarks/datadog/metric_handler.py index 2b0c3afc9a..d56f8d4929 100644 --- a/test/benchmarks/datadog/metric_handler.py +++ b/test/benchmarks/datadog/metric_handler.py @@ -118,7 +118,7 @@ def __init__(self, datadog_api_key: str, datadog_host: str): def send_custom_dd_metric(self, metric: CustomDatadogMetric) -> dict: datadog.initialize(api_key=self.datadog_api_key, api_host=self.datadog_host) - tags: List[str] = list(map(lambda t: str(t.value), metric.tags)) + tags: List[str] = [str(t.value) for t in metric.tags] post_metric_response: Dict = datadog.api.Metric.send( metric=metric.name, points=[(metric.timestamp, metric.value)], tags=tags ) diff --git a/test/benchmarks/datadog/send_metrics.py b/test/benchmarks/datadog/send_metrics.py index fcaefa1f5c..54d7346a4d 100644 --- a/test/benchmarks/datadog/send_metrics.py +++ b/test/benchmarks/datadog/send_metrics.py @@ -107,7 +107,7 @@ def get_benchmark_type_tag(reader_tag, retriever_tag, document_store_tag): def collect_metrics_from_json_files(folder_path): benchmark_metrics = parse_benchmark_files(folder_path) metrics_to_send_to_dd = [] - for benchmark_name, metrics in benchmark_metrics.items(): + for metrics in benchmark_metrics.values(): indexing_metrics = metrics["indexing"] querying_metrics = metrics["querying"] config = metrics["config"] diff --git a/test/benchmarks/utils.py b/test/benchmarks/utils.py index cfbf659458..8ef0ae438c 100644 --- a/test/benchmarks/utils.py +++ b/test/benchmarks/utils.py @@ -175,7 +175,7 @@ def contains_reader(pipeline: Pipeline) -> bool: Check if a pipeline contains a Reader component. :param pipeline: Pipeline """ - components = [comp for comp in pipeline.components.values()] + components = list(pipeline.components.values()) return any(isinstance(comp, BaseReader) for comp in components) @@ -183,5 +183,5 @@ def contains_retriever(pipeline: Pipeline) -> bool: """ Check if a pipeline contains a Retriever component. 
""" - components = [comp for comp in pipeline.components.values()] + components = list(pipeline.components.values()) return any(isinstance(comp, BaseRetriever) for comp in components) diff --git a/test/document_stores/test_elasticsearch.py b/test/document_stores/test_elasticsearch.py index cd73198a89..adfdeb0a82 100644 --- a/test/document_stores/test_elasticsearch.py +++ b/test/document_stores/test_elasticsearch.py @@ -283,8 +283,8 @@ def test_elasticsearch_brownfield_support(self, ds, documents): assert all("name" in doc.meta for doc in transferred_documents) assert all(doc.id == doc._get_id(["content", "meta"]) for doc in transferred_documents) - original_content = set([doc.content for doc in original_documents]) - transferred_content = set([doc.content for doc in transferred_documents]) + original_content = {doc.content for doc in original_documents} + transferred_content = {doc.content for doc in transferred_documents} assert original_content == transferred_content # Test transferring docs with PreProcessor diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py index 53c726b313..d1bbb2fc1b 100644 --- a/test/nodes/test_file_converter.py +++ b/test/nodes/test_file_converter.py @@ -394,7 +394,7 @@ def test_id_hash_keys_from_pipeline_params(samples_path): converter = TextConverter() output, _ = converter.run(file_paths=[doc_path, doc_path], meta=meta, id_hash_keys=["content", "meta"]) documents = output["documents"] - unique_ids = set(d.id for d in documents) + unique_ids = {d.id for d in documents} assert len(documents) == 2 assert len(unique_ids) == 2 diff --git a/test/nodes/test_preprocessor.py b/test/nodes/test_preprocessor.py index 9b3f9c8d48..222841a54f 100644 --- a/test/nodes/test_preprocessor.py +++ b/test/nodes/test_preprocessor.py @@ -237,7 +237,7 @@ def test_id_hash_keys_from_pipeline_params(): preprocessor = PreProcessor(split_length=2, split_respect_sentence_boundary=False) output, _ = preprocessor.run(documents=[document_1, document_2], id_hash_keys=["content", "meta"]) documents = output["documents"] - unique_ids = set(d.id for d in documents) + unique_ids = {d.id for d in documents} assert len(documents) == 4 assert len(unique_ids) == 4 diff --git a/test/nodes/test_reader.py b/test/nodes/test_reader.py index bb77e57e41..c97bf418f5 100644 --- a/test/nodes/test_reader.py +++ b/test/nodes/test_reader.py @@ -160,7 +160,7 @@ def test_deduplication_for_overlapping_documents(reader): prediction = reader.predict(query="Where does Carla live?", documents=docs, top_k=5) # Check that there are no duplicate answers - assert len(set(ans.answer for ans in prediction["answers"])) == len(prediction["answers"]) + assert len({ans.answer for ans in prediction["answers"]}) == len(prediction["answers"]) @pytest.mark.integration diff --git a/test/nodes/test_shaper.py b/test/nodes/test_shaper.py index 4c4fc3923e..4d494de052 100644 --- a/test/nodes/test_shaper.py +++ b/test/nodes/test_shaper.py @@ -937,7 +937,7 @@ def test_strings_to_answers_after_prompt_node_yaml(tmp_path): ) results = result["answers"] assert len(results) == 4 - assert any([True for r in results if "Berlin" in r.answer]) + assert any(True for r in results if "Berlin" in r.answer) for answer in results[:2]: assert answer.document_ids == ["123"] assert ( @@ -1527,7 +1527,7 @@ def test_with_multiple_prompt_nodes(tmp_path): ) results = result["answers"] assert len(results) == 2 - assert any([True for r in results if "Berlin" in r.answer]) + assert any(True for r in results if "Berlin" in r.answer) 
@pytest.mark.unit diff --git a/test/nodes/test_web_search.py b/test/nodes/test_web_search.py index 70c3c9c0f5..9e9ce1f9dd 100644 --- a/test/nodes/test_web_search.py +++ b/test/nodes/test_web_search.py @@ -39,7 +39,7 @@ def test_web_search_with_site_keyword(): assert len(result["documents"]) > 0 assert isinstance(result["documents"][0], Document) assert all( - ["nasa" in doc.meta["link"] or "lifewire" in doc.meta["link"] for doc in result["documents"]] + "nasa" in doc.meta["link"] or "lifewire" in doc.meta["link"] for doc in result["documents"] ), "Some documents are not from the specified sites lifewire.com or nasa.gov." diff --git a/test/others/test_utils.py b/test/others/test_utils.py index 31690c7962..c3382b8dd1 100644 --- a/test/others/test_utils.py +++ b/test/others/test_utils.py @@ -546,7 +546,7 @@ def test_list_files_on_deepset_cloud(): ) client = DeepsetCloud.get_file_client(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY) - files = [f for f in client.list_files()] + files = list(client.list_files()) assert len(files) == 2 assert files[0]["name"] == "sample_pdf_1.pdf" assert files[1]["name"] == "sample_pdf_2.pdf" diff --git a/test/pipelines/test_eval.py b/test/pipelines/test_eval.py index 73aa2ae6bd..19b952c14d 100644 --- a/test/pipelines/test_eval.py +++ b/test/pipelines/test_eval.py @@ -537,10 +537,10 @@ def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path, eval_labels): # all expected columns are part of the evaluation result dataframe assert sorted(expected_reader_result_columns + expected_generic_result_columns + ["index"]) == sorted( - list(reader_result.columns) + reader_result.columns ) assert sorted(expected_retriever_result_columns + expected_generic_result_columns + ["index"]) == sorted( - list(retriever_result.columns) + retriever_result.columns ) assert ( @@ -676,10 +676,10 @@ def test_generative_qa_eval(retriever_with_docs, tmp_path, eval_labels): # all expected columns are part of the evaluation result dataframe assert sorted(expected_generator_result_columns + expected_generic_result_columns + ["index"]) == sorted( - list(generator_result.columns) + generator_result.columns ) assert sorted(expected_retriever_result_columns + expected_generic_result_columns + ["index"]) == sorted( - list(retriever_result.columns) + retriever_result.columns ) assert generator_result["prompt"].iloc[0] is not None @@ -777,10 +777,10 @@ def test_generative_qa_w_promptnode_eval(retriever_with_docs, tmp_path, eval_lab # all expected columns are part of the evaluation result dataframe assert sorted(expected_generator_result_columns + expected_generic_result_columns + ["index"]) == sorted( - list(generator_result.columns) + generator_result.columns ) assert sorted(expected_retriever_result_columns + expected_generic_result_columns + ["index"]) == sorted( - list(retriever_result.columns) + retriever_result.columns ) assert generator_result["prompt"].iloc[0] is not None diff --git a/test/prompt/test_prompt_template.py b/test/prompt/test_prompt_template.py index cf8254d3d4..78c458a17a 100644 --- a/test/prompt/test_prompt_template.py +++ b/test/prompt/test_prompt_template.py @@ -345,7 +345,7 @@ def test_prompt_template_syntax_fill( self, prompt_text: str, documents: List[Document], query: str, expected_prompts: List[str] ): prompt_template = PromptTemplate(prompt_text) - prompts = [prompt for prompt in prompt_template.fill(documents=documents, query=query)] + prompts = list(prompt_template.fill(documents=documents, query=query)) assert prompts == expected_prompts @pytest.mark.unit 
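The prompt-template and shaper hunks above show two related shapes: list(iterable) to materialize a generator, and any(condition for x in xs) in place of any([condition for x in xs]). The second is more than style — a generator argument lets any() short-circuit on the first truthy element, while the list form evaluates the predicate for every element before any() even starts. A small illustrative sketch (the data is made up for the demo):

    import timeit

    results = list(range(1_000_000))

    # any() over a generator stops at the first hit (element 11 here);
    # any() over a list comprehension builds and scans the whole list first.
    with_list = timeit.timeit(lambda: any([r > 10 for r in results]), number=20)
    with_gen = timeit.timeit(lambda: any(r > 10 for r in results), number=20)
    print(f"any(list): {with_list:.4f}s  any(gen): {with_gen:.4f}s")

    # list(iterable) is the canonical way to materialize a generator,
    # replacing copies like [prompt for prompt in template.fill(...)].
    gen = (r * 2 for r in range(5))
    assert list(gen) == [0, 2, 4, 6, 8]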
@@ -372,7 +372,7 @@ def test_prompt_template_syntax_fill( ) def test_join(self, prompt_text: str, documents: List[Document], expected_prompts: List[str]): prompt_template = PromptTemplate(prompt_text) - prompts = [prompt for prompt in prompt_template.fill(documents=documents)] + prompts = list(prompt_template.fill(documents=documents)) assert prompts == expected_prompts @pytest.mark.unit @@ -405,7 +405,7 @@ def test_join(self, prompt_text: str, documents: List[Document], expected_prompt ) def test_to_strings(self, prompt_text: str, documents: List[Document], expected_prompts: List[str]): prompt_template = PromptTemplate(prompt_text) - prompts = [prompt for prompt in prompt_template.fill(documents=documents)] + prompts = list(prompt_template.fill(documents=documents)) assert prompts == expected_prompts @pytest.mark.unit @@ -466,7 +466,7 @@ def test_prompt_template_syntax_fill_ignores_dangerous_input( self, prompt_text: str, documents: List[Document], query: str, expected_prompts: List[str] ): prompt_template = PromptTemplate(prompt_text) - prompts = [prompt for prompt in prompt_template.fill(documents=documents, query=query)] + prompts = list(prompt_template.fill(documents=documents, query=query)) assert prompts == expected_prompts def test_prompt_template_remove_template_params(self): From 564556a509a7b628b06b5c0f79370d9225e20513 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 15 Sep 2023 18:48:20 +0200 Subject: [PATCH 2/6] pre-commit fixes --- .pre-commit-config.yaml | 2 +- releasenotes/notes/refactor-pinecone-document-store.yaml | 6 +++--- .../support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 23464f67fa..a973508ad8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,7 +24,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.0.289 hooks: - - id: ruff + - id: ruff - repo: https://github.com/codespell-project/codespell rev: v2.2.5 diff --git a/releasenotes/notes/refactor-pinecone-document-store.yaml b/releasenotes/notes/refactor-pinecone-document-store.yaml index d67d134a34..b8145ac50b 100644 --- a/releasenotes/notes/refactor-pinecone-document-store.yaml +++ b/releasenotes/notes/refactor-pinecone-document-store.yaml @@ -1,6 +1,6 @@ --- enhancements: - | - Refactor PineconeDocumentStore to use metadata instead of namespaces - for distinction between documents with embeddings, documents without - embeddings and labels \ No newline at end of file + Refactor PineconeDocumentStore to use metadata instead of namespaces + for distinction between documents with embeddings, documents without + embeddings and labels diff --git a/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml b/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml index 642831b752..9fc499418c 100644 --- a/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml +++ b/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml @@ -2,4 +2,3 @@ fixes: - | gpt-35-turbo-16k model from Azure can integrate correctly - From 6788fc908f7fb1979a992ddbb732bef6ded3b5e8 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 15 Sep 2023 19:14:04 +0200 Subject: [PATCH 3/6] Revert changes to examples/basic_qa_pipeline.py --- examples/basic_qa_pipeline.py | 15 +++++++-------- pyproject.toml | 1 + 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/basic_qa_pipeline.py 
b/examples/basic_qa_pipeline.py index b58e0ccfe9..9e90114d0e 100644 --- a/examples/basic_qa_pipeline.py +++ b/examples/basic_qa_pipeline.py @@ -1,17 +1,16 @@ import logging from pathlib import Path +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) + from haystack.document_stores import ElasticsearchDocumentStore -from haystack.nodes import BM25Retriever, FARMReader +from haystack.utils import fetch_archive_from_http, print_answers, launch_es +from haystack.nodes import FARMReader, BM25Retriever from haystack.nodes.file_classifier import FileTypeClassifier -from haystack.nodes.file_converter import TextConverter from haystack.nodes.preprocessor import PreProcessor +from haystack.nodes.file_converter import TextConverter from haystack.pipelines import Pipeline -from haystack.utils import fetch_archive_from_http, launch_es, print_answers - -# pylint: disable=no-logging-basicconfig -logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) -logging.getLogger("haystack").setLevel(logging.INFO) def basic_qa_pipeline(): @@ -23,7 +22,7 @@ def basic_qa_pipeline(): s3_url = "https://core-engineering.s3.eu-central-1.amazonaws.com/public/scripts/wiki_gameofthrones_txt1.zip" fetch_archive_from_http(url=s3_url, output_dir=doc_dir) - file_paths = list(Path(doc_dir).glob("**/*")) + file_paths = [p for p in Path(doc_dir).glob("**/*")] files_metadata = [{"name": path.name} for path in file_paths] # Indexing Pipeline diff --git a/pyproject.toml b/pyproject.toml index c6a1373324..e4ab823355 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -420,6 +420,7 @@ ignore = [ max-complexity = 28 [tool.ruff.per-file-ignores] +"examples/basic_qa_pipeline.py" = ["C416"] "haystack/preview/testing/document_store.py" = ["F821"] "haystack/telemetry.py" = ["F821"] From 97c4c2cf7702cee090bf7f804c24474399752faa Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 15 Sep 2023 19:36:09 +0200 Subject: [PATCH 4/6] Revert changes to haystack/preview/testing/document_store.py --- haystack/preview/testing/document_store.py | 7 ++++--- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/haystack/preview/testing/document_store.py b/haystack/preview/testing/document_store.py index e4ae2b6bb3..9127fdab1c 100644 --- a/haystack/preview/testing/document_store.py +++ b/haystack/preview/testing/document_store.py @@ -316,13 +316,14 @@ def test_nin_filter_embedding(self, docstore: DocumentStore, filterable_docs: Li result = docstore.filter_documents(filters={"embedding": {"$nin": [embedding_ones, embedding_zeros]}}) assert self.contains_same_docs( result, - list( - filterable_docs + [ + doc + for doc in filterable_docs or ( not np.array_equal(embedding_zeros, doc.embedding) # type: ignore and not np.array_equal(embedding_ones, doc.embedding) # type: ignore ) - ), + ], ) @pytest.mark.unit diff --git a/pyproject.toml b/pyproject.toml index e4ab823355..144473722b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -421,7 +421,7 @@ max-complexity = 28 [tool.ruff.per-file-ignores] "examples/basic_qa_pipeline.py" = ["C416"] -"haystack/preview/testing/document_store.py" = ["F821"] +"haystack/preview/testing/document_store.py" = ["C416", "F821"] "haystack/telemetry.py" = ["F821"] [tool.ruff.pylint] From c884e2552341a3c2d103c70aa6d7b79ddc4709c7 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sat, 16 Sep 2023 00:43:06 +0200 Subject: [PATCH 5/6] revert releasenotes --- 
releasenotes/notes/refactor-pinecone-document-store.yaml | 6 +++--- .../support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/releasenotes/notes/refactor-pinecone-document-store.yaml b/releasenotes/notes/refactor-pinecone-document-store.yaml index b8145ac50b..d67d134a34 100644 --- a/releasenotes/notes/refactor-pinecone-document-store.yaml +++ b/releasenotes/notes/refactor-pinecone-document-store.yaml @@ -1,6 +1,6 @@ --- enhancements: - | - Refactor PineconeDocumentStore to use metadata instead of namespaces - for distinction between documents with embeddings, documents without - embeddings and labels + Refactor PineconeDocumentStore to use metadata instead of namespaces + for distinction between documents with embeddings, documents without + embeddings and labels \ No newline at end of file diff --git a/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml b/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml index 9fc499418c..642831b752 100644 --- a/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml +++ b/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml @@ -2,3 +2,4 @@ fixes: - | gpt-35-turbo-16k model from Azure can integrate correctly + From 40f3efde4e21f090b17983f67f0bea3ba83398a1 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sat, 16 Sep 2023 09:19:12 +0200 Subject: [PATCH 6/6] Upgrade to ruff v0.0.290 --- .pre-commit-config.yaml | 2 +- haystack/nodes/other/join_docs.py | 2 +- haystack/nodes/retriever/sparse.py | 5 +---- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a973508ad8..5375c94d75 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: - id: black-jupyter - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.289 + rev: v0.0.290 hooks: - id: ruff diff --git a/haystack/nodes/other/join_docs.py b/haystack/nodes/other/join_docs.py index 27761535ce..4185873a7c 100644 --- a/haystack/nodes/other/join_docs.py +++ b/haystack/nodes/other/join_docs.py @@ -82,7 +82,7 @@ def run_accumulated(self, inputs: List[dict], top_k_join: Optional[int] = None): "score would be `-infinity`." ) else: - sorted_docs = [(k, v) for k, v in scores_map.items()] + sorted_docs = list(scores_map.items()) if not top_k_join: top_k_join = self.top_k_join diff --git a/haystack/nodes/retriever/sparse.py b/haystack/nodes/retriever/sparse.py index 61f7b0c8d0..c32dddf0a6 100644 --- a/haystack/nodes/retriever/sparse.py +++ b/haystack/nodes/retriever/sparse.py @@ -457,10 +457,7 @@ def _get_all_paragraphs(self, document_store: BaseDocumentStore, index: Optional def _calc_scores(self, queries: List[str], index: str) -> List[Dict[int, float]]: question_vector = self.vectorizer.transform(queries) doc_scores_per_query = self.tfidf_matrices[index].dot(question_vector.T).T.toarray() - doc_scores_per_query = [ - [(doc_idx, doc_score) for doc_idx, doc_score in enumerate(doc_scores)] - for doc_scores in doc_scores_per_query - ] + doc_scores_per_query = [list(enumerate(doc_scores)) for doc_scores in doc_scores_per_query] indices_and_scores: List[Dict] = [ OrderedDict(sorted(query_idx_scores, key=lambda tup: tup[1], reverse=True)) for query_idx_scores in doc_scores_per_query
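
Three of the less obvious rewrites in this series, restated as a standalone sketch with hypothetical data: dict views are already set-like, so set(list(d.keys())) makes two needless copies before taking the difference; enumerate() yields exactly the (index, value) tuples that sparse.py previously rebuilt with a comprehension; and hoisting try/except out of a loop (PERF203) removes per-iteration handler setup but can change behavior, which is presumably why PERF203 sits on the ignore list in pyproject.toml.

    # Dict views support set operations directly, replacing
    # set(list(dev_labels.keys())) - set(list(predictions.keys())).
    dev_labels = {"q1": "a", "q2": "b", "q3": "c"}
    predictions = {"q1": "a", "q3": "c"}
    assert dev_labels.keys() - predictions.keys() == {"q2"}

    # enumerate() already yields (index, value) pairs, so
    # [(i, s) for i, s in enumerate(scores)] is just list(enumerate(scores)).
    scores = [0.2, 0.9, 0.5]
    ranked = sorted(enumerate(scores), key=lambda t: t[1], reverse=True)
    assert ranked[0] == (1, 0.9)

    # PERF203: a handler outside the loop is entered once, not per iteration.
    # Note the changed semantics: the loop now aborts at the first bad item
    # instead of skipping it, so this rewrite is not always safe to automate.
    items = ["1", "2", "x", "4"]
    parsed = []
    try:
        for item in items:
            parsed.append(int(item))
    except ValueError:
        pass
    assert parsed == [1, 2]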