Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: mock scatter plot functionality #279

Merged
merged 6 commits into from
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added datasets/data/pd_series_HCV.pkl
gunnarstaff marked this conversation as resolved.
Show resolved Hide resolved
Binary file not shown.
8 changes: 8 additions & 0 deletions docs-source/source/resample.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,11 @@ Reindex

* :ref:`sphx_glr_auto_examples_reindex_plot_pearson_correlation.py`

Reindex scatterplot
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autofunction:: indsl.resample.reindex_scatter

Reindex scatterplot x-values
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autofunction:: indsl.resample.reindex_scatter_x

87 changes: 87 additions & 0 deletions examples/reindex/plot_mock_scatter_plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Copyright 2024 Cognite AS
"""
=======================================================
Re-indexing to mock a scatter plot
=======================================================

This shows how we can superimpose a scatter plot on an existing chart
"""

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime

from indsl.resample.mock_scatter_plot import reindex_scatter, reindex_scatter_x

from scipy.interpolate import interp1d

# Locate the data directory: relative to the working directory when run as a
# script, relative to this file otherwise
base_path = "" if __name__ == "__main__" else os.path.dirname(__file__)

# Read in data for a production choke opening
filename = os.path.join(base_path, "../../datasets/data/pd_series_HCV.pkl")
HCV_series = pd.read_pickle(filename)


# Creating a mock CV curve using a sine form
n = 20
x_values = np.linspace(0, 100, n)
y_values = (np.sin(x_values / x_values.max() * np.pi - np.pi * 0.5) + 1) * 0.5

# Calculate the CV value for the different choke openings, using the interpolated CV curve
interpolator = interp1d(x_values, y_values)
CV_array = interpolator(HCV_series.values)
# Create the series for the CV value
CV_series = pd.Series(CV_array, index=HCV_series.index)

# We normalise choke opening such that [0,100] covers the entire time range
x_min_value = 0
x_max_value = 100
scatter_y = reindex_scatter(
    HCV_series, CV_series, x_min_value=x_min_value, x_max_value=x_max_value, align_timesteps=True
)
scatter_x = reindex_scatter_x(HCV_series, x_min_value=x_min_value, x_max_value=x_max_value)


fig = plt.figure(figsize=(12, 8))
lns1 = plt.plot(HCV_series.index, HCV_series.values, "-b", label="Choke opening")
axl = plt.gca()
axr = axl.twinx()
lns2 = axr.plot(scatter_y.index, scatter_y.values, ".r", label="CV curve")
lns3 = axl.plot(scatter_x.index, scatter_x, ".g", label="Choke opening")
axl.set_xlabel("Time/choke opening")
axl.set_ylabel("Choke opening [%]")
axr.set_ylabel("CV [-]")

# Adding both time and choke opening for the x-axis.
# Render once first: the text of the tick labels is only guaranteed to be
# filled in after the figure has been drawn.
fig.canvas.draw()
xticks_pos = axl.get_xticks()  # Get the position of the existing ticks
xtick_labels = axl.get_xticklabels()  # Get the date for the existing ticks
# Convert the dates to epoch seconds. pd.Timestamp is used (instead of a naive
# datetime) so that the tick dates are interpreted exactly like the index
# timestamps below, independently of the local timezone of the machine.
xticks_pos_epoc = [pd.Timestamp(val.get_text()).timestamp() for val in xtick_labels]

# Need to convert the timestamp to the corresponding choke opening
epoc_start = HCV_series.index[0].timestamp()
epoc_end = HCV_series.index[-1].timestamp()
d_epoc = epoc_end - epoc_start
# Map the position of every tick within the time range to the choke opening scale
xtic_labels_hcv = [
    (val - epoc_start) / d_epoc * (x_max_value - x_min_value) for val in xticks_pos_epoc
]  # gives us the HCV value for the corresponding points
# Create the tick label consisting of the date, and the choke opening value
# (clamped to [x_min_value, x_max_value]) on a new line
xtick_labels_mod = [
    f"{date_label.get_text()}\n{min(max(hcv_value, x_min_value), x_max_value):1.3g}"
    for date_label, hcv_value in zip(xtick_labels, xtic_labels_hcv)
]
# Finally update the x-tick values
plt.xticks(xticks_pos, xtick_labels_mod)

# Add all the lines to a single legend
lns = lns1 + lns2 + lns3
labs = [line.get_label() for line in lns]
plt.legend(lns, labs, loc=4)
plt.xlim([HCV_series.index[0], HCV_series.index[-1]])
plt.tight_layout()
plt.show()
21 changes: 19 additions & 2 deletions indsl/resample/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,29 @@
# Copyright 2023 Cognite AS
from .group_by import group_by_region
from .interpolate import interpolate
from .mock_scatter_plot import reindex_scatter, reindex_scatter_x
from .reindex import reindex
from .resample import resample, resample_to_granularity


# NOTE(review): presumably the display name of this toolbox — confirm against
# whatever consumes TOOLBOX_NAME.
TOOLBOX_NAME = "Resample"

# Names exported by `from indsl.resample import *`.
# NOTE(review): `__all__` is assigned twice in a row; only the second, longer
# list takes effect. This looks like the old and the new side of a diff kept
# together — confirm and drop the first assignment.
__all__ = ["interpolate", "resample", "resample_to_granularity", "group_by_region", "reindex"]
__all__ = [
"interpolate",
"resample",
"resample_to_granularity",
"group_by_region",
"reindex",
"reindex_scatter",
"reindex_scatter_x",
]

# NOTE(review): `__cognite__` is a project-specific list, not a Python
# convention; here it carries the same names as `__all__` — confirm what
# consumes it. It is also assigned twice, with the second list taking effect.
__cognite__ = ["interpolate", "resample", "resample_to_granularity", "group_by_region", "reindex"]
__cognite__ = [
"interpolate",
"resample",
"resample_to_granularity",
"group_by_region",
"reindex",
"reindex_scatter",
"reindex_scatter_x",
]
111 changes: 111 additions & 0 deletions indsl/resample/mock_scatter_plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Copyright 2024 Cognite AS
from datetime import datetime
from typing import Tuple

import numpy as np
import pandas as pd

from indsl.resample.auto_align import auto_align
from indsl.type_check import check_types


@check_types
def __reindex_scatter_core(
    signal_x: pd.Series, x_min_value: float = 0, x_max_value: float = 1
) -> Tuple[np.array, np.array]:
    """Reindex scatterplot core.

    Build the timestamps used to draw a scatter plot inside a time series chart.
    Every value of signal_x is mapped linearly onto the time span covered by
    signal_x.index, such that x_min_value falls on the first timestamp and
    x_max_value on the last one.

    Args:
        signal_x: x-value.
            The time series where the values are used as the x-value
        x_min_value: Minimum x value.
            The x value that is mapped to the first timestamp of signal_x.
        x_max_value: Maximum x value.
            The x value that is mapped to the last timestamp of signal_x.

    Returns:
        np.array: Scatter plot index, Scatter plot sort index
            The new timestamps (timezone-naive, expressed in UTC) in ascending order,
            and the positions that sort the original samples into that order.
    """
    # Function-scope import: the module itself only imports `datetime` from datetime
    from datetime import timezone

    # Convert timestamps to epoch seconds; pandas treats naive timestamps as UTC here
    epoc = np.array([val.timestamp() for val in signal_x.index])
    d_epoc = epoc[-1] - epoc[0]

    # Fraction of the [x_min_value, x_max_value] range covered by every value.
    # The offset by x_min_value makes x_min_value land on the first timestamp.
    fraction = (signal_x.values - x_min_value) / (x_max_value - x_min_value)
    index_x_epoc = fraction * d_epoc + epoc[0]  # scale to the time span and translate

    # Convert back in UTC (and drop the tzinfo) to mirror the conversion above.
    # A plain datetime.fromtimestamp(epoch) would convert to the local time of the
    # machine and shift all timestamps by the local UTC offset.
    index_x = np.array(
        [datetime.fromtimestamp(epoc_, tz=timezone.utc).replace(tzinfo=None) for epoc_ in index_x_epoc]
    )

    # Create a sort index, such that the order is increasing
    index_sort = np.argsort(index_x_epoc)

    return index_x[index_sort], index_sort


@check_types
def reindex_scatter(
    signal_x: pd.Series,
    signal_y: pd.Series,
    x_min_value: float = 0,
    x_max_value: float = 1,
    align_timesteps: bool = False,
) -> pd.Series:
    """Reindex scatterplot.

    It returns the values of signal_y on a new time index. The new timestamps are
    derived from the values of signal_x, scaled onto the range of timestamps of signal_x.
    The timestamps are returned in ascending order, and the values of signal_y are
    reordered with the same sort-index.

    This is a way of creating a scatterplot inside a chart

    Args:
        signal_x: x-value.
            The time series where the values are used as the x-value
        signal_y: y-value.
            The time series where the values are used as the y-value
        x_min_value: Minimum x value.
            Minimum x value, used to scale the x values.
        x_max_value: Maximum x value.
            Maximum x value, used to scale the x value.
        align_timesteps (bool) : Auto-align
            Automatically align time stamp of input time series. Default is False.

    Returns:
        pandas.Series: Scatter plot
    """
    x_series, y_series = signal_x, signal_y
    if align_timesteps:
        # Bring both inputs onto common timestamps before pairing their values
        x_series, y_series = auto_align([signal_x, signal_y], align_timesteps)

    scatter_index, order = __reindex_scatter_core(x_series, x_min_value=x_min_value, x_max_value=x_max_value)

    # Reorder the y-values exactly like the timestamps were reordered
    return pd.Series(y_series.array[order], index=scatter_index)


@check_types
def reindex_scatter_x(signal_x: pd.Series, x_min_value: float = 0, x_max_value: float = 1) -> pd.Series:
    """Reindex scatterplot x-values.

    It returns the values of signal_x on a new time index. The new timestamps are
    derived from those same values, scaled onto the range of timestamps of signal_x.
    The timestamps are returned in ascending order, and the values are reordered
    with the same sort-index.
    Since every value is placed at a time proportional to the value itself, the
    result plots as a straight line.

    Args:
        signal_x: x-value.
            The time series where the values are used as the x-value
        x_min_value: Minimum x value.
            Minimum x value, used to scale the x values.
        x_max_value: Maximum x value.
            Maximum x value, used to scale the x value.

    Returns:
        pandas.Series: Scatter plot
    """
    scatter_index, order = __reindex_scatter_core(signal_x, x_min_value=x_min_value, x_max_value=x_max_value)

    # Reorder the x-values exactly like the timestamps were reordered
    reordered_x = signal_x.array[order]
    return pd.Series(reordered_x, index=scatter_index)
65 changes: 65 additions & 0 deletions tests/resample/test_mock_scatter_plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copyright 2024 Cognite AS
import random

from datetime import datetime

import numpy as np
import pandas as pd
import pytest

from pandas.testing import assert_series_equal

from indsl.exceptions import UserTypeError, UserValueError
from indsl.resample.mock_scatter_plot import reindex_scatter, reindex_scatter_x


# Compare reindex_scatter and reindex_scatter_x with a step-by-step reference calculation
@pytest.mark.core
def test_reindex_scatter():
    """Check the scatter re-indexing of the choke opening dataset against a reference.

    A mock CV curve is evaluated for every choke opening, both series are
    re-indexed, and the results are compared with timestamps and values that
    are computed explicitly in this test.
    """
    from datetime import timezone

    from scipy.interpolate import interp1d

    HCV_series = pd.read_pickle("./datasets/data/pd_series_HCV.pkl")

    # Defining the interpolation curve (mock CV curve with a sine form)
    n = 20
    x_values = np.linspace(0, 100, n)
    y_values = (np.sin(x_values / x_values.max() * np.pi - np.pi * 0.5) + 1) * 0.5

    interpolator = interp1d(x_values, y_values)
    CV_array = interpolator(HCV_series.values)
    # Create the series for the CV value
    CV_series = pd.Series(CV_array, index=HCV_series.index)

    x_min_value = 0
    x_max_value = 100
    signal_scatter = reindex_scatter(
        HCV_series, CV_series, x_min_value=x_min_value, x_max_value=x_max_value, align_timesteps=True
    )
    signal_scatter_x = reindex_scatter_x(HCV_series, x_min_value=x_min_value, x_max_value=x_max_value)

    # Calculate separately
    # Convert timestamps to epoch seconds (pandas treats naive timestamps as UTC)
    epoc = np.array([val.timestamp() for val in HCV_series.index])
    d_epoc = epoc[-1] - epoc[0]
    # The scale of HCV_series is [0,100]. We will now map it to the epoch and then convert it to datetime
    sequence_epoc = (HCV_series.values - x_min_value) / (x_max_value - x_min_value) * d_epoc
    index_cv_epoc = sequence_epoc + epoc[0]  # translate
    # Convert back in UTC, matching the conversion to epoch above, so that the
    # expectation does not depend on the local timezone of the machine
    index_cv = np.array(
        [datetime.fromtimestamp(epoc_, tz=timezone.utc).replace(tzinfo=None) for epoc_ in index_cv_epoc]
    )
    # Create a sort index, such that the order is increasing
    index_sort = np.argsort(index_cv_epoc)
    expected_scatter = pd.Series(CV_array[index_sort], index=index_cv[index_sort])
    expected_scatter_x = pd.Series(HCV_series.values[index_sort], index=index_cv[index_sort])

    assert_series_equal(signal_scatter, expected_scatter)
    assert_series_equal(signal_scatter_x, expected_scatter_x)
Loading