# Copyright 2024 Cognite AS
"""
=======================================================
Re-indexing to mock a scatter plot
=======================================================

This shows how we can superimpose a scatter plot on an existing chart
"""

import os

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from scipy.interpolate import interp1d

from indsl.resample.mock_scatter_plot import reindex_scatter, reindex_scatter_x

# Resolve the data folder relative to this file unless the script is run directly
base_path = "" if __name__ == "__main__" else os.path.dirname(__file__)

# Read in data for a production choke opening
filename = os.path.join(base_path, "../../datasets/data/pd_series_HCV.pkl")
HCV_series = pd.read_pickle(filename)

# Creating a mock CV curve using a sine form
n = 20
x_values = np.linspace(0, 100, n)
y_values = (np.sin(x_values / x_values.max() * np.pi - np.pi * 0.5) + 1) * 0.5

# Calculate the CV value for the different choke openings, using the interpolated CV curve
interpolator = interp1d(x_values, y_values)
CV_array = interpolator(HCV_series.values)
# Create the series for the CV value
CV_series = pd.Series(CV_array, index=HCV_series.index)

# We normalise the choke opening such that [0, 100] covers the entire time range
x_min_value = 0
x_max_value = 100
scatter_y = reindex_scatter(
    HCV_series, CV_series, x_min_value=x_min_value, x_max_value=x_max_value, align_timesteps=True
)
scatter_x = reindex_scatter_x(HCV_series, x_min_value=x_min_value, x_max_value=x_max_value)

plt.figure(figsize=(12, 8))
lns1 = plt.plot(HCV_series.index, HCV_series.values, "-b", label="Choke opening")
axl = plt.gca()
axr = axl.twinx()
lns2 = axr.plot(scatter_y.index, scatter_y.values, ".r", label="CV curve")
lns3 = axl.plot(scatter_x.index, scatter_x, ".g", label="Choke opening")
axl.set_xlabel("Time/choke opening")
axl.set_ylabel("Choke opening [%]")
axr.set_ylabel("CV [-]")

# Adding both time and choke opening for the x-axis.
# The tick positions are converted straight to datetimes instead of re-parsing the
# rendered label text, so the result does not depend on the label format that the
# date formatter happened to pick, nor on the local timezone of the machine.
xticks_pos = axl.get_xticks()
xtick_times = [mdates.num2date(pos) for pos in xticks_pos]

# Time span of the data, used to convert a tick time to the corresponding choke opening
epoc_start = HCV_series.index[0].timestamp()
epoc_end = HCV_series.index[-1].timestamp()
d_epoc = epoc_end - epoc_start

# Create the tick label consisting of the date, and the choke opening value on a new line
xtick_labels_mod = []
for tick_time in xtick_times:
    fraction = (tick_time.timestamp() - epoc_start) / d_epoc
    opening = x_min_value + fraction * (x_max_value - x_min_value)
    # Ticks outside the data range are clamped to the [x_min_value, x_max_value] scale
    opening = min(max(opening, x_min_value), x_max_value)
    xtick_labels_mod.append(f"{tick_time:%Y-%m-%d}\n{opening:1.3g}")

# Finally update the x-tick values
plt.xticks(xticks_pos, xtick_labels_mod)

# Add the lines from both axes to a single legend
lns = lns1 + lns2 + lns3
labs = [line.get_label() for line in lns]
plt.legend(lns, labs, loc=4)
plt.xlim([HCV_series.index[0], HCV_series.index[-1]])
plt.tight_layout()
plt.show()
from datetime import datetime
from typing import Tuple

import numpy as np
import pandas as pd

from indsl.resample.auto_align import auto_align
from indsl.type_check import check_types


@check_types
def __reindex_scatter_core(
    signal_x: pd.Series, x_min_value: float = 0, x_max_value: float = 1
) -> Tuple[np.ndarray, np.ndarray]:
    """Reindex scatterplot core.

    Maps every value of signal_x linearly onto the time span covered by signal_x.index,
    such that x_min_value lands on the first timestamp and x_max_value on the last one.
    Values outside [x_min_value, x_max_value] end up outside that time span.
    The resulting timestamps are returned in ascending order together with the
    permutation that sorts them, so callers can reorder their values the same way.
    This is a way of creating a scatterplot inside a chart.

    Args:
        signal_x: x-value.
            The time series where the values are used as the x-value
        x_min_value: Minimum x value.
            Minimum x value, used to scale the x values.
        x_max_value: Maximum x value.
            Maximum x value, used to scale the x value.

    Returns:
        Tuple[np.ndarray, np.ndarray]: Scatter plot index (sorted), Scatter plot sort index

    Raises:
        ValueError: If x_max_value equals x_min_value, since the scaling is then undefined.
    """
    x_range = x_max_value - x_min_value
    if x_range == 0:
        raise ValueError("x_max_value must be different from x_min_value")

    # Nothing to map; return empty arrays rather than failing on epoc[-1] below
    if len(signal_x) == 0:
        return np.array([], dtype="datetime64[ns]"), np.array([], dtype=np.intp)

    # convert timestamps to epoch seconds
    epoc = np.array([val.timestamp() for val in signal_x.index])
    d_epoc = epoc[-1] - epoc[0]
    # Map the values in signal_x onto the epoch span. The offset by x_min_value makes
    # x_min_value coincide with the first timestamp (a no-op for the default of 0).
    sequence_epoc = (signal_x.values - x_min_value) / x_range * d_epoc
    index_x_epoc = sequence_epoc + epoc[0]  # translate
    # NOTE(review): datetime.fromtimestamp returns naive *local* time, whereas
    # Timestamp.timestamp() treats naive timestamps as UTC. On a non-UTC machine the
    # round trip presumably shifts the index by the local UTC offset - confirm intent.
    index_x = np.array([datetime.fromtimestamp(epoc_) for epoc_ in index_x_epoc])
    # create a sort index, such that the order is increasing
    index_sort = np.argsort(index_x_epoc)

    return index_x[index_sort], index_sort


@check_types
def reindex_scatter(
    signal_x: pd.Series,
    signal_y: pd.Series,
    x_min_value: float = 0,
    x_max_value: float = 1,
    align_timesteps: bool = False,
) -> pd.Series:
    """Reindex scatterplot.

    It returns the values from signal_y with timestamps derived from the values of signal_x,
    where the values of signal_x have been scaled to the range of timestamps of signal_x.
    The timestamps are sorted in ascending order, and the values are sorted with the same sort-index.

    This is a way of creating a scatterplot inside a chart

    Args:
        signal_x: x-value.
            The time series where the values are used as the x-value
        signal_y: y-value.
            The time series where the values are used as the y-value
        x_min_value: Minimum x value.
            Minimum x value, used to scale the x values.
        x_max_value: Maximum x value.
            Maximum x value, used to scale the x value.
        align_timesteps (bool) : Auto-align
            Automatically align time stamp of input time series. Default is False.

    Returns:
        pandas.Series: Scatter plot

    Raises:
        ValueError: If x_max_value equals x_min_value.
    """
    if align_timesteps:
        signal_x, signal_y = auto_align([signal_x, signal_y], align_timesteps)

    index_x_sorted, index_sort = __reindex_scatter_core(signal_x, x_min_value=x_min_value, x_max_value=x_max_value)

    # The y-values are paired with the x-values by position, not by timestamp.
    # NOTE(review): this assumes signal_y has one sample per signal_x sample
    # (presumably what auto_align provides when align_timesteps is set) - confirm.
    return pd.Series(signal_y.array[index_sort], index=index_x_sorted)


@check_types
def reindex_scatter_x(signal_x: pd.Series, x_min_value: float = 0, x_max_value: float = 1) -> pd.Series:
    """Reindex scatterplot x-values.

    It returns the values from signal_x with timestamps derived from those same values,
    where the values have been scaled to the range of timestamps of signal_x.
    The timestamps are sorted in ascending order, and the values are sorted with the same sort-index.
    In effect this is a straight line going from x_min_value to x_max_value over the time range of signal_x

    Args:
        signal_x: x-value.
            The time series where the values are used as the x-value
        x_min_value: Minimum x value.
            Minimum x value, used to scale the x values.
        x_max_value: Maximum x value.
            Maximum x value, used to scale the x value.

    Returns:
        pandas.Series: Scatter plot

    Raises:
        ValueError: If x_max_value equals x_min_value.
    """
    index_x_sorted, index_sort = __reindex_scatter_core(signal_x, x_min_value=x_min_value, x_max_value=x_max_value)

    return pd.Series(signal_x.array[index_sort], index=index_x_sorted)
@pytest.mark.core
def test_reindex_scatter():
    """Check reindex_scatter and reindex_scatter_x against a separate calculation.

    A mock CV curve is interpolated at the choke openings of the HCV data set, and the
    series returned by the two functions are compared with series that are built
    step by step inside this test.
    """
    from pathlib import Path

    from scipy.interpolate import interp1d

    # Locate the data set from this file's position (tests/resample/), so the test
    # does not depend on the directory pytest is started from
    data_file = Path(__file__).resolve().parents[2] / "datasets" / "data" / "pd_series_HCV.pkl"
    hcv_series = pd.read_pickle(data_file)

    # defining the interpolation curve
    n = 20
    x_values = np.linspace(0, 100, n)
    y_values = (np.sin(x_values / x_values.max() * np.pi - np.pi * 0.5) + 1) * 0.5

    interpolator = interp1d(x_values, y_values)
    cv_array = interpolator(hcv_series.values)
    # Create the series for the CV value
    cv_series = pd.Series(cv_array, index=hcv_series.index)

    x_min_value = 0
    x_max_value = 100
    signal_scatter = reindex_scatter(
        hcv_series, cv_series, x_min_value=x_min_value, x_max_value=x_max_value, align_timesteps=True
    )
    signal_scatter_x = reindex_scatter_x(hcv_series, x_min_value=x_min_value, x_max_value=x_max_value)

    # Calculate separately
    # convert timestamps to epoch seconds
    epoc = np.array([val.timestamp() for val in hcv_series.index])
    d_epoc = epoc[-1] - epoc[0]
    # The scale of hcv_series is [0, 100]. Map it onto the epoch span and convert to datetime
    sequence_epoc = (hcv_series.values - x_min_value) / (x_max_value - x_min_value) * d_epoc
    index_cv_epoc = sequence_epoc + epoc[0]  # translate
    index_cv = np.array([datetime.fromtimestamp(epoc_) for epoc_ in index_cv_epoc])
    # create a sort index, such that the order is increasing
    index_sort = np.argsort(index_cv_epoc)
    expected_scatter = pd.Series(cv_array[index_sort], index=index_cv[index_sort])
    expected_scatter_x = pd.Series(hcv_series.values[index_sort], index=index_cv[index_sort])

    assert_series_equal(signal_scatter, expected_scatter)
    assert_series_equal(signal_scatter_x, expected_scatter_x)