This repository has been archived by the owner on Aug 25, 2024. It is now read-only.

Commit

util: crypto: Add secure/insecure_hash functions
Replace direct hashlib usage throughout the codebase with the new
secure_hash/insecure_hash helpers so that all hashing goes through
dffml/util/crypto.py, which makes auditing easier.

Fixes: #297
sanjibansg authored and pdxjohnny committed Mar 27, 2021
1 parent cfadbd0 commit 2174a51
Showing 12 changed files with 103 additions and 114 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Tutorial on how to load models dynamically
https://intel.github.io/dffml/tutorials/models/load.html
- Added download progressbar in `util/net.py`
### Changed
- Calls to hashlib now go through helper functions in `dffml/util/crypto.py`
### Fixed
- Record object key properties are now always strings

8 changes: 4 additions & 4 deletions configloader/image/tests/test_config.py
@@ -1,8 +1,8 @@
import json
import pathlib
import hashlib

from dffml.util.asynctestcase import AsyncTestCase
from dffml.util.crypto import secure_hash
from dffml_config_image.configloader import PNGConfigLoader

IMAGE1_HASH = "6faf9050c6d387bc6a68d9e12127f883011add2ec994b8e66c7c0996636f2789af8d28fc11e6528a327a6383c1473e72"
@@ -22,8 +22,8 @@ async def test_dumpb_loadb(self):
/ "image1.png"
).read_bytes()
original = await ctx.loadb(image_bytes)
hash_original = hashlib.sha384(
json.dumps(original.flatten().tolist()).encode()
).hexdigest()
hash_original = secure_hash(
json.dumps(original.flatten().tolist()), algorithm="sha384"
)
self.assertEqual(original.shape, (280, 280, 3))
self.assertEqual(hash_original, IMAGE1_HASH)
143 changes: 56 additions & 87 deletions dffml/cli/dataflow.py
@@ -1,5 +1,4 @@
import pathlib
import hashlib
import contextlib
from typing import List, Dict, Any

@@ -27,6 +26,7 @@
KeysCMDConfig,
)
from ..util.cli.parser import ParseInputsAction
from ..util.crypto import insecure_hash
from ..base import config, field
from ..high_level import run as run_dataflow

@@ -488,45 +488,37 @@ async def run(self):
# Skip stage if not wanted
if self.stages and stage.value not in self.stages:
continue
stage_node = hashlib.md5(
("stage." + stage.value).encode()
).hexdigest()
stage_node = insecure_hash("stage." + stage.value)
if len(self.stages) != 1:
print(f"subgraph {stage_node}[{stage.value.title()} Stage]")
print(f"style {stage_node} fill:#afd388b5,stroke:#a4ca7a")
for instance_name, operation in dataflow.operations.items():
if operation.stage != stage:
continue
subgraph_node = hashlib.md5(
("subgraph." + instance_name).encode()
).hexdigest()
node = hashlib.md5(instance_name.encode()).hexdigest()
subgraph_node = insecure_hash("subgraph." + instance_name)
node = insecure_hash(instance_name)
if not self.simple:
print(f"subgraph {subgraph_node}[{instance_name}]")
print(f"style {subgraph_node} fill:#fff4de,stroke:#cece71")
print(f"{node}[{operation.instance_name}]")
for input_name in operation.inputs.keys():
input_node = hashlib.md5(
("input." + instance_name + "." + input_name).encode()
).hexdigest()
input_node = insecure_hash(
"input." + instance_name + "." + input_name
)
if not self.simple:
print(f"{input_node}({input_name})")
print(f"{input_node} --> {node}")
for output_name in operation.outputs.keys():
output_node = hashlib.md5(
(
"output." + instance_name + "." + output_name
).encode()
).hexdigest()
output_node = insecure_hash(
"output." + instance_name + "." + output_name
)
if not self.simple:
print(f"{output_node}({output_name})")
print(f"{node} --> {output_node}")
for condition in operation.conditions:
condition_node = hashlib.md5(
(
"condition." + instance_name + "." + condition.name
).encode()
).hexdigest()
condition_node = insecure_hash(
"condition." + instance_name + "." + condition.name
)
if not self.simple:
print(f"{condition_node}{'{' + condition.name + '}'}")
print(f"{condition_node} --> {node}")
@@ -541,15 +533,15 @@ async def run(self):
operation = dataflow.operations[instance_name]
if self.stages and not operation.stage.value in self.stages:
continue
node = hashlib.md5(instance_name.encode()).hexdigest()
node = insecure_hash(instance_name)
for input_name, sources in input_flow.inputs.items():
for source in sources:
# TODO Put various sources in their own "Inputs" subgraphs
if isinstance(source, str):
input_definition = operation.inputs[input_name]
seed_input_node = hashlib.md5(
(source + "." + input_definition.name).encode()
).hexdigest()
seed_input_node = insecure_hash(
source + "." + input_definition.name
)
print(
f"{seed_input_node}({source}<br>{input_definition.name})"
)
@@ -558,11 +550,9 @@
f"style {seed_input_node} fill:#f6dbf9,stroke:#a178ca"
)
if not self.simple:
input_node = hashlib.md5(
(
"input." + instance_name + "." + input_name
).encode()
).hexdigest()
input_node = insecure_hash(
"input." + instance_name + "." + input_name
)
print(f"{seed_input_node} --> {input_node}")
else:
print(f"{seed_input_node} --> {node}")
@@ -577,9 +567,9 @@
origin_definition_name = (
origin + "." + definition_name
)
seed_input_node = hashlib.md5(
origin_definition_name.encode()
).hexdigest()
seed_input_node = insecure_hash(
origin_definition_name
)
print(
f"{seed_input_node}({source}<br>{origin_definition_name})"
)
@@ -588,14 +578,9 @@
f"style {seed_input_node} fill:#f6dbf9,stroke:#a178ca"
)
if not self.simple:
input_node = hashlib.md5(
(
"input."
+ instance_name
+ "."
+ input_name
).encode()
).hexdigest()
input_node = insecure_hash(
"input." + instance_name + "." + input_name
)
print(f"{seed_input_node} --> {input_node}")
else:
print(f"{seed_input_node} --> {node}")
@@ -612,67 +597,51 @@
):
source = source[0]
if not self.simple:
source_output_node = hashlib.md5(
(
"output."
+ ".".join(list(source.items())[0])
).encode()
).hexdigest()
input_node = hashlib.md5(
(
"input." + instance_name + "." + input_name
).encode()
).hexdigest()
source_output_node = insecure_hash(
"output." + ".".join(list(source.items())[0])
)
input_node = insecure_hash(
"input." + instance_name + "." + input_name
)

print(f"{source_output_node} --> {input_node}")
else:
source_operation_node = hashlib.md5(
list(source.keys())[0].encode()
).hexdigest()
source_operation_node = insecure_hash(
list(source.keys())[0]
)
print(f"{source_operation_node} --> {node}")
for i, condition in enumerate(input_flow.conditions):
if isinstance(condition, str):
if not self.simple:
condition_name = operation.conditions[i].name
seed_condition_node = hashlib.md5(
(condition + "." + condition_name).encode()
).hexdigest()
seed_condition_node = insecure_hash(
condition + "." + condition_name
)
print(f"{seed_condition_node}({condition_name})")
seed_dependent_node = hashlib.md5(
(
"condition."
+ instance_name
+ "."
+ condition_name
).encode()
).hexdigest()
seed_dependent_node = insecure_hash(
"condition." + instance_name + "." + condition_name
)
print(
f"{seed_condition_node} --> {seed_dependent_node}"
)
else:
if not self.simple:
dependee_node = hashlib.md5(
(
"output."
+ ".".join(list(condition.items())[0])
).encode()
).hexdigest()
dependent_node = hashlib.md5(
(
"condition."
+ instance_name
+ "."
+ dataflow.operations[
list(condition.keys())[0]
]
.outputs[list(condition.values())[0]]
.name
).encode()
).hexdigest()
dependee_node = insecure_hash(
"output." + ".".join(list(condition.items())[0])
)
dependent_node = insecure_hash(
"condition."
+ instance_name
+ "."
+ dataflow.operations[list(condition.keys())[0]]
.outputs[list(condition.values())[0]]
.name
)
print(f"{dependee_node} --> {dependent_node}")
else:
dependee_operation_node = hashlib.md5(
list(condition.keys())[0].encode()
).hexdigest()
dependee_operation_node = insecure_hash(
list(condition.keys())[0]
)
print(f"{dependee_operation_node} --> {node}")
if len(self.stages) != 1:
print(f"end")
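The diagram code above only needs short, deterministic node identifiers for its mermaid output, which is why the MD5-backed `insecure_hash` is an acceptable replacement for `hashlib.md5` here. A minimal sketch of the pattern, using a hypothetical operation instance name and input name:

```python
from dffml.util.crypto import insecure_hash

# Hypothetical operation instance and input, to show what the diagram code emits.
instance_name = "multiply"
node = insecure_hash(instance_name)
input_node = insecure_hash("input." + instance_name + ".multiplicand")

# Same print pattern as above: declare the input node, then draw the edge.
print(f"{input_node}(multiplicand)")
print(f"{input_node} --> {node}")
```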
4 changes: 2 additions & 2 deletions dffml/df/memory.py
@@ -3,7 +3,6 @@
import copy
import asyncio
import secrets
import hashlib
import inspect
import itertools
import traceback
@@ -76,6 +75,7 @@
from ..util.cli.arg import Arg
from ..util.data import ignore_args
from ..util.asynchelper import aenter_stack, concurrently
from ..util.crypto import secure_hash

from .log import LOGGER

@@ -859,7 +859,7 @@ def _unique(instance_name: str, handle: str, *uids: str) -> str:
operation.instance_name, and the sorted list of input uuids.
"""
uid_list = [instance_name, handle] + sorted(uids)
return hashlib.sha384("".join(uid_list).encode("utf-8")).hexdigest()
return secure_hash("".join(uid_list), "sha384")

async def unique(
self, operation: Operation, parameter_set: BaseParameterSet
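A worked example of the key `_unique` now returns (hypothetical instance name, handle, and UIDs); it is equivalent to the previous `hashlib.sha384(...).hexdigest()` call because `secure_hash` UTF-8 encodes its input and applies SHA-384:

```python
from dffml.util.crypto import secure_hash

# The UIDs are sorted first, so the key does not depend on argument order.
uid_list = ["my_op", "ctx_handle"] + sorted(["uid-b", "uid-a"])
key = secure_hash("".join(uid_list), "sha384")

# Equivalent to hashing the concatenated string "my_opctx_handleuid-auid-b".
assert key == secure_hash("my_opctx_handleuid-auid-b", "sha384")
```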
1 change: 0 additions & 1 deletion dffml/model/model.py
@@ -7,7 +7,6 @@
"""
import abc
import json
import hashlib
import pathlib
from typing import AsyncIterator, Optional

20 changes: 20 additions & 0 deletions dffml/util/crypto.py
@@ -0,0 +1,20 @@
"""
All hashing originates in this file for easier auditing.
"""
import hashlib


SECURE_HASH_ALGORITHM = hashlib.sha384
INSECURE_HASH_ALGORITHM = hashlib.md5


def secure_hash(string, algorithm) -> str:
if isinstance(string, str):
string = string.encode("utf-8")
return SECURE_HASH_ALGORITHM(string).hexdigest()


def insecure_hash(string) -> str:
if isinstance(string, str):
string = string.encode("utf-8")
return INSECURE_HASH_ALGORITHM(string).hexdigest()
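
For reference, a short usage sketch of the two new helpers as they are called elsewhere in this commit. Note that `secure_hash` accepts an `algorithm` argument at the call sites, but as written the function always applies the module-level `SECURE_HASH_ALGORITHM` (SHA-384); the argument documents intent rather than selecting an algorithm. The snippet assumes an installed `dffml` with this module present:

```python
from dffml.util.crypto import secure_hash, insecure_hash

# Collision-resistant identifiers (SHA-384), e.g. the operation/parameter-set
# keys built by _unique() in dffml/df/memory.py.
uid = secure_hash("my_operation" + "ctx_handle", "sha384")
assert len(uid) == 96  # SHA-384 hex digest is 96 characters

# Short, stable identifiers where collision resistance does not matter (MD5),
# e.g. the mermaid node IDs emitted by dffml/cli/dataflow.py.
node = insecure_hash("stage.processing")
assert len(node) == 32  # MD5 hex digest is 32 characters

# str and bytes inputs hash identically because str is UTF-8 encoded first.
assert secure_hash("abc", "sha384") == secure_hash(b"abc", "sha384")
```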
5 changes: 3 additions & 2 deletions dffml/util/file.py
@@ -1,8 +1,9 @@
import re
import hashlib
import pathlib
from typing import Union, Tuple

from .crypto import SECURE_HASH_ALGORITHM


class NoHashToUseForValidationSuppliedError(Exception):
"""
@@ -81,7 +82,7 @@ def validate_file_hash(
filepath = pathlib.Path(filepath)
if expected_sha384_hash is None:
raise NoHashToUseForValidationSuppliedError(filepath)
filehash = hashlib.sha384()
filehash = SECURE_HASH_ALGORITHM()
with open(filepath, "rb") as fileobj:
bytes_read = fileobj.read(chunk_size)
filehash.update(bytes_read)
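The hunk above cuts off after the first chunk is read; for context, a standalone sketch of the same chunked-hashing pattern (hypothetical helper name, not DFFML's exact implementation):

```python
import hashlib
import pathlib

SECURE_HASH_ALGORITHM = hashlib.sha384  # mirrors dffml/util/crypto.py


def sha384_of_file(filepath, chunk_size: int = 8192) -> str:
    # Read and hash in fixed-size chunks so large files are never
    # loaded into memory all at once.
    filehash = SECURE_HASH_ALGORITHM()
    with open(pathlib.Path(filepath), "rb") as fileobj:
        while True:
            bytes_read = fileobj.read(chunk_size)
            if not bytes_read:
                break
            filehash.update(bytes_read)
    # validate_file_hash() compares a digest like this to expected_sha384_hash.
    return filehash.hexdigest()
```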
8 changes: 4 additions & 4 deletions model/scikit/dffml_model_scikit/scikit_base.py
@@ -4,7 +4,6 @@
Base class for Scikit models
"""
import json
import hashlib
import pathlib
import logging
import importlib
@@ -30,6 +29,7 @@
from dffml.source.source import Sources, SourcesContext
from dffml.model.model import ModelConfig, ModelContext, Model, ModelNotTrained
from dffml.feature.feature import Features, Feature
from dffml.util.crypto import secure_hash


class ScikitConfig(ModelConfig, NamedTuple):
@@ -64,9 +64,9 @@ def _feature_predict_hash(self):
if k not in ["features", "tcluster", "predict"]
]
)
return hashlib.sha384(
"".join([params] + self.features).encode()
).hexdigest()
return secure_hash(
"".join([params] + self.features), algorithm="sha384"
)

@property
def _filepath(self):
1 change: 0 additions & 1 deletion model/tensorflow/dffml_model_tensorflow/dnnc.py
@@ -4,7 +4,6 @@
"""
import os
import abc
import hashlib
import inspect
import pathlib

(Diffs for the remaining 3 changed files were not loaded.)
