From 2174a5107dff588caad8ade98d223ac4202bcef7 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 2 Mar 2021 04:53:02 +0530 Subject: [PATCH] util: crypto: Add secure/insecure_hash functions Remove usages of hashlib throughout the codebase to make auditing easier. Fixes: #297 --- CHANGELOG.md | 2 + configloader/image/tests/test_config.py | 8 +- dffml/cli/dataflow.py | 143 +++++++----------- dffml/df/memory.py | 4 +- dffml/model/model.py | 1 - dffml/util/crypto.py | 20 +++ dffml/util/file.py | 5 +- .../scikit/dffml_model_scikit/scikit_base.py | 8 +- .../tensorflow/dffml_model_tensorflow/dnnc.py | 1 - .../text_classifier.py | 4 +- .../dffml_model_vowpalWabbit/vw_base.py | 10 +- tests/source/test_idx.py | 11 +- 12 files changed, 103 insertions(+), 114 deletions(-) create mode 100644 dffml/util/crypto.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 40ead0aa4b..6b58f8b40c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Tutorial on how to load models dynamically https://intel.github.io/dffml/tutorials/models/load.html - Added download progressbar in `util/net.py` +### Changed +- Calls to hashlib now go through helper functions ### Fixed - Record object key properties are now always strings diff --git a/configloader/image/tests/test_config.py b/configloader/image/tests/test_config.py index a2b181c690..6084b0a989 100644 --- a/configloader/image/tests/test_config.py +++ b/configloader/image/tests/test_config.py @@ -1,8 +1,8 @@ import json import pathlib -import hashlib from dffml.util.asynctestcase import AsyncTestCase +from dffml.util.crypto import secure_hash from dffml_config_image.configloader import PNGConfigLoader IMAGE1_HASH = "6faf9050c6d387bc6a68d9e12127f883011add2ec994b8e66c7c0996636f2789af8d28fc11e6528a327a6383c1473e72" @@ -22,8 +22,8 @@ async def test_dumpb_loadb(self): / "image1.png" ).read_bytes() original = await ctx.loadb(image_bytes) - hash_original = 
hashlib.sha384( - json.dumps(original.flatten().tolist()).encode() - ).hexdigest() + hash_original = secure_hash( + json.dumps(original.flatten().tolist()), algorithm="sha384" + ) self.assertEqual(original.shape, (280, 280, 3)) self.assertEqual(hash_original, IMAGE1_HASH) diff --git a/dffml/cli/dataflow.py b/dffml/cli/dataflow.py index 8edae433f1..00e56251a9 100644 --- a/dffml/cli/dataflow.py +++ b/dffml/cli/dataflow.py @@ -1,5 +1,4 @@ import pathlib -import hashlib import contextlib from typing import List, Dict, Any @@ -27,6 +26,7 @@ KeysCMDConfig, ) from ..util.cli.parser import ParseInputsAction +from ..util.crypto import insecure_hash from ..base import config, field from ..high_level import run as run_dataflow @@ -488,45 +488,37 @@ async def run(self): # Skip stage if not wanted if self.stages and stage.value not in self.stages: continue - stage_node = hashlib.md5( - ("stage." + stage.value).encode() - ).hexdigest() + stage_node = insecure_hash("stage." + stage.value) if len(self.stages) != 1: print(f"subgraph {stage_node}[{stage.value.title()} Stage]") print(f"style {stage_node} fill:#afd388b5,stroke:#a4ca7a") for instance_name, operation in dataflow.operations.items(): if operation.stage != stage: continue - subgraph_node = hashlib.md5( - ("subgraph." + instance_name).encode() - ).hexdigest() - node = hashlib.md5(instance_name.encode()).hexdigest() + subgraph_node = insecure_hash("subgraph." + instance_name) + node = insecure_hash(instance_name) if not self.simple: print(f"subgraph {subgraph_node}[{instance_name}]") print(f"style {subgraph_node} fill:#fff4de,stroke:#cece71") print(f"{node}[{operation.instance_name}]") for input_name in operation.inputs.keys(): - input_node = hashlib.md5( - ("input." + instance_name + "." + input_name).encode() - ).hexdigest() + input_node = insecure_hash( + "input." + instance_name + "." 
+ input_name + ) if not self.simple: print(f"{input_node}({input_name})") print(f"{input_node} --> {node}") for output_name in operation.outputs.keys(): - output_node = hashlib.md5( - ( - "output." + instance_name + "." + output_name - ).encode() - ).hexdigest() + output_node = insecure_hash( + "output." + instance_name + "." + output_name + ) if not self.simple: print(f"{output_node}({output_name})") print(f"{node} --> {output_node}") for condition in operation.conditions: - condition_node = hashlib.md5( - ( - "condition." + instance_name + "." + condition.name - ).encode() - ).hexdigest() + condition_node = insecure_hash( + "condition." + instance_name + "." + condition.name + ) if not self.simple: print(f"{condition_node}{'{' + condition.name + '}'}") print(f"{condition_node} --> {node}") @@ -541,15 +533,15 @@ async def run(self): operation = dataflow.operations[instance_name] if self.stages and not operation.stage.value in self.stages: continue - node = hashlib.md5(instance_name.encode()).hexdigest() + node = insecure_hash(instance_name) for input_name, sources in input_flow.inputs.items(): for source in sources: # TODO Put various sources in their own "Inputs" subgraphs if isinstance(source, str): input_definition = operation.inputs[input_name] - seed_input_node = hashlib.md5( - (source + "." + input_definition.name).encode() - ).hexdigest() + seed_input_node = insecure_hash( + source + "." + input_definition.name + ) print( f"{seed_input_node}({source}
{input_definition.name})" ) @@ -558,11 +550,9 @@ async def run(self): f"style {seed_input_node} fill:#f6dbf9,stroke:#a178ca" ) if not self.simple: - input_node = hashlib.md5( - ( - "input." + instance_name + "." + input_name - ).encode() - ).hexdigest() + input_node = insecure_hash( + "input." + instance_name + "." + input_name + ) print(f"{seed_input_node} --> {input_node}") else: print(f"{seed_input_node} --> {node}") @@ -577,9 +567,9 @@ async def run(self): origin_definition_name = ( origin + "." + definition_name ) - seed_input_node = hashlib.md5( - origin_definition_name.encode() - ).hexdigest() + seed_input_node = insecure_hash( + origin_definition_name ) print( f"{seed_input_node}({source}
{origin_definition_name})" ) @@ -588,14 +578,9 @@ async def run(self): f"style {seed_input_node} fill:#f6dbf9,stroke:#a178ca" ) if not self.simple: - input_node = hashlib.md5( - ( - "input." - + instance_name - + "." - + input_name - ).encode() - ).hexdigest() + input_node = insecure_hash( + "input." + instance_name + "." + input_name + ) print(f"{seed_input_node} --> {input_node}") else: print(f"{seed_input_node} --> {node}") @@ -612,67 +597,51 @@ async def run(self): ): source = source[0] if not self.simple: - source_output_node = hashlib.md5( - ( - "output." - + ".".join(list(source.items())[0]) - ).encode() - ).hexdigest() - input_node = hashlib.md5( - ( - "input." + instance_name + "." + input_name - ).encode() - ).hexdigest() + source_output_node = insecure_hash( + "output." + ".".join(list(source.items())[0]) + ) + input_node = insecure_hash( + "input." + instance_name + "." + input_name + ) + print(f"{source_output_node} --> {input_node}") else: - source_operation_node = hashlib.md5( - list(source.keys())[0].encode() - ).hexdigest() + source_operation_node = insecure_hash( + list(source.keys())[0] + ) print(f"{source_operation_node} --> {node}") for i, condition in enumerate(input_flow.conditions): if isinstance(condition, str): if not self.simple: condition_name = operation.conditions[i].name - seed_condition_node = hashlib.md5( - (condition + "." + condition_name).encode() - ).hexdigest() + seed_condition_node = insecure_hash( + condition + "." + condition_name + ) print(f"{seed_condition_node}({condition_name})") - seed_dependent_node = hashlib.md5( - ( - "condition." - + instance_name - + "." - + condition_name - ).encode() - ).hexdigest() + seed_dependent_node = insecure_hash( + "condition." + instance_name + "." + condition_name + ) print( f"{seed_condition_node} --> {seed_dependent_node}" ) else: if not self.simple: - dependee_node = hashlib.md5( - ( - "output." 
- + ".".join(list(condition.items())[0]) - ).encode() - ).hexdigest() - dependent_node = hashlib.md5( - ( - "condition." - + instance_name - + "." - + dataflow.operations[ - list(condition.keys())[0] - ] - .outputs[list(condition.values())[0]] - .name - ).encode() - ).hexdigest() + dependee_node = insecure_hash( + "output." + ".".join(list(condition.items())[0]) + ) + dependent_node = insecure_hash( + "condition." + + instance_name + + "." + + dataflow.operations[list(condition.keys())[0]] + .outputs[list(condition.values())[0]] + .name + ) print(f"{dependee_node} --> {dependent_node}") else: - dependee_operation_node = hashlib.md5( - list(condition.keys())[0].encode() - ).hexdigest() + dependee_operation_node = insecure_hash( + list(condition.keys())[0] + ) print(f"{dependee_operation_node} --> {node}") if len(self.stages) != 1: print(f"end") diff --git a/dffml/df/memory.py b/dffml/df/memory.py index a3df8ce358..876d1ddd9b 100644 --- a/dffml/df/memory.py +++ b/dffml/df/memory.py @@ -3,7 +3,6 @@ import copy import asyncio import secrets -import hashlib import inspect import itertools import traceback @@ -76,6 +75,7 @@ from ..util.cli.arg import Arg from ..util.data import ignore_args from ..util.asynchelper import aenter_stack, concurrently +from ..util.crypto import secure_hash from .log import LOGGER @@ -859,7 +859,7 @@ def _unique(instance_name: str, handle: str, *uids: str) -> str: operation.instance_name, and the sorted list of input uuids. 
""" uid_list = [instance_name, handle] + sorted(uids) - return hashlib.sha384("".join(uid_list).encode("utf-8")).hexdigest() + return secure_hash("".join(uid_list), "sha384") async def unique( self, operation: Operation, parameter_set: BaseParameterSet diff --git a/dffml/model/model.py b/dffml/model/model.py index b7df86d477..7eb4a2ca1e 100644 --- a/dffml/model/model.py +++ b/dffml/model/model.py @@ -7,7 +7,6 @@ """ import abc import json -import hashlib import pathlib from typing import AsyncIterator, Optional diff --git a/dffml/util/crypto.py b/dffml/util/crypto.py new file mode 100644 index 0000000000..168821d5e4 --- /dev/null +++ b/dffml/util/crypto.py @@ -0,0 +1,20 @@ +""" +All hashing originates in this file for easier auditing. +""" +import hashlib + + +SECURE_HASH_ALGORITHM = hashlib.sha384 +INSECURE_HASH_ALGORITHM = hashlib.md5 + + +def secure_hash(string, algorithm) -> str: + if isinstance(string, str): + string = string.encode("utf-8") + return hashlib.new(algorithm, string).hexdigest() + + +def insecure_hash(string) -> str: + if isinstance(string, str): + string = string.encode("utf-8") + return INSECURE_HASH_ALGORITHM(string).hexdigest() diff --git a/dffml/util/file.py b/dffml/util/file.py index f75252ee4f..e1ed894444 100644 --- a/dffml/util/file.py +++ b/dffml/util/file.py @@ -1,8 +1,9 @@ import re -import hashlib import pathlib from typing import Union, Tuple +from .crypto import SECURE_HASH_ALGORITHM + class NoHashToUseForValidationSuppliedError(Exception): """ @@ -81,7 +82,7 @@ def validate_file_hash( filepath = pathlib.Path(filepath) if expected_sha384_hash is None: raise NoHashToUseForValidationSuppliedError(filepath) - filehash = hashlib.sha384() + filehash = SECURE_HASH_ALGORITHM() with open(filepath, "rb") as fileobj: bytes_read = fileobj.read(chunk_size) filehash.update(bytes_read) diff --git a/model/scikit/dffml_model_scikit/scikit_base.py b/model/scikit/dffml_model_scikit/scikit_base.py index 5c0ebcdfb4..a69a2c0362 100--- 
a/model/scikit/dffml_model_scikit/scikit_base.py +++ b/model/scikit/dffml_model_scikit/scikit_base.py @@ -4,7 +4,6 @@ Base class for Scikit models """ import json -import hashlib import pathlib import logging import importlib @@ -30,6 +29,7 @@ from dffml.source.source import Sources, SourcesContext from dffml.model.model import ModelConfig, ModelContext, Model, ModelNotTrained from dffml.feature.feature import Features, Feature +from dffml.util.crypto import secure_hash class ScikitConfig(ModelConfig, NamedTuple): @@ -64,9 +64,9 @@ def _feature_predict_hash(self): if k not in ["features", "tcluster", "predict"] ] ) - return hashlib.sha384( - "".join([params] + self.features).encode() - ).hexdigest() + return secure_hash( + "".join([params] + self.features), algorithm="sha384" + ) @property def _filepath(self): diff --git a/model/tensorflow/dffml_model_tensorflow/dnnc.py b/model/tensorflow/dffml_model_tensorflow/dnnc.py index 1f5b2e41fa..a4771d22a2 100644 --- a/model/tensorflow/dffml_model_tensorflow/dnnc.py +++ b/model/tensorflow/dffml_model_tensorflow/dnnc.py @@ -4,7 +4,6 @@ """ import os import abc -import hashlib import inspect import pathlib diff --git a/model/tensorflow_hub/dffml_model_tensorflow_hub/text_classifier.py b/model/tensorflow_hub/dffml_model_tensorflow_hub/text_classifier.py index 5dc3ee0ada..e8530ecf42 100644 --- a/model/tensorflow_hub/dffml_model_tensorflow_hub/text_classifier.py +++ b/model/tensorflow_hub/dffml_model_tensorflow_hub/text_classifier.py @@ -5,7 +5,6 @@ """ # TODO Add docstrings import os -import hashlib import pathlib import importlib from typing import AsyncIterator, Tuple, Any, List, Type @@ -17,6 +16,7 @@ from dffml.record import Record from dffml.model.accuracy import Accuracy from dffml.util.entrypoint import entrypoint +from dffml.util.crypto import secure_hash from dffml.base import config, field from dffml.feature.feature import Feature, Features from dffml.source.source import Sources, SourcesContext @@ -118,7 +118,7 @@ 
def _model_dir_path(self): str(len(self.cids)), self.parent.config.model_path, ] - model = hashlib.sha384("".join(_to_hash).encode("utf-8")).hexdigest() + model = secure_hash("".join(_to_hash), algorithm="sha384") # Needed to save updated model if not os.path.isdir(self.parent.config.directory): raise NotADirectoryError( diff --git a/model/vowpalWabbit/dffml_model_vowpalWabbit/vw_base.py b/model/vowpalWabbit/dffml_model_vowpalWabbit/vw_base.py index 607be283dd..71ddfe8744 100644 --- a/model/vowpalWabbit/dffml_model_vowpalWabbit/vw_base.py +++ b/model/vowpalWabbit/dffml_model_vowpalWabbit/vw_base.py @@ -5,7 +5,6 @@ """ import os import json -import hashlib from pathlib import Path from typing import AsyncIterator, Tuple, Any, List @@ -19,6 +18,7 @@ from dffml.model.accuracy import Accuracy from dffml.source.source import Sources, SourcesContext from dffml.util.entrypoint import entrypoint +from dffml.util.crypto import secure_hash from dffml.base import config, field from dffml.feature.feature import Features, Feature from dffml.model.model import ModelContext, Model, ModelNotTrained @@ -141,9 +141,9 @@ def _feature_predict_hash(self): ] ) params = "".join(params) - return hashlib.sha384( - "".join([params] + self.features).encode() - ).hexdigest() + return secure_hash( + "".join([params] + self.features), algorithm="sha384" + ) def applicable_features(self, features): usable = [] @@ -449,7 +449,7 @@ def __init__(self, config) -> None: def _filename(self): return os.path.join( self.config.directory, - hashlib.sha384(self.config.predict.name.encode()).hexdigest() + secure_hash(self.config.predict.name, algorithm="sha384") + ".json", ) diff --git a/tests/source/test_idx.py b/tests/source/test_idx.py index d0def76b40..a1e3df14b7 100644 --- a/tests/source/test_idx.py +++ b/tests/source/test_idx.py @@ -1,8 +1,8 @@ import json import pathlib -import hashlib from dffml.util.net import cached_download +from dffml.util.crypto import secure_hash from dffml.util.asynctestcase 
import AsyncTestCase from dffml.source.idx1 import IDX1SourceConfig, IDX1Source @@ -51,9 +51,8 @@ async def test_idx3(self, filename): self.assertIn(feature_name, records[0].features()) for i in range(-1, 1): with self.subTest(index=i): - is_hash = hashlib.sha384( - json.dumps( - records[i].feature(feature_name) - ).encode() - ).hexdigest() + is_hash = secure_hash( + json.dumps(records[i].feature(feature_name)), + algorithm="sha384", + ) self.assertEqual(is_hash, IDX3_FIRST_LAST[i])