Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Similarity as a default action #182

Merged
merged 7 commits into from
Dec 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions lux/action/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from lux.vis.VisList import VisList
from lux.processor.Compiler import Compiler
from lux.utils import utils
from lux.utils.utils import get_filter_specs


def filter(ldf):
Expand Down Expand Up @@ -112,9 +113,28 @@ def get_complementary_ops(fltr_op):
new_spec.append(new_filter)
temp_vis = Vis(new_spec)
output.append(temp_vis)
if (
ldf.current_vis is not None
and len(ldf.current_vis) == 1
and ldf.current_vis[0].mark == "line"
and len(get_filter_specs(ldf.intent)) > 0
):
recommendation = {
"action": "Similarity",
"description": "Show other charts that are visually similar to the Current vis.",
}
last = get_filter_specs(ldf.intent)[-1]
output = ldf.intent.copy()[0:-1]
# array of possible values for attribute
arr = ldf[last.attribute].unique().tolist()
output.append(lux.Clause(last.attribute, last.attribute, arr))
vlist = lux.vis.VisList.VisList(output, ldf)
for vis in vlist:
vis.score = interestingness(vis, ldf)
vlist_copy = lux.vis.VisList.VisList(output, ldf)
for i in range(len(vlist_copy)):
vlist[i].score = interestingness(vlist_copy[i], ldf)
vlist = vlist.topK(15)
recommendation["collection"] = vlist
if recommendation["action"] == "Similarity":
recommendation["collection"] = vlist[1:]
else:
recommendation["collection"] = vlist
return recommendation
19 changes: 19 additions & 0 deletions lux/interestingness/interestingness.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from scipy.spatial.distance import euclidean
import lux
from lux.utils.utils import get_filter_specs
from lux.interestingness.similarity import preprocess, euclidean_dist
from lux.vis.VisList import VisList


def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
Expand Down Expand Up @@ -68,6 +71,22 @@ def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
dimension_lst = vis.get_attr_by_data_model("dimension")
measure_lst = vis.get_attr_by_data_model("measure")
v_size = len(vis.data)

if (
n_dim == 1
and (n_msr == 0 or n_msr == 1)
and ldf.current_vis is not None
and vis.get_attr_by_channel("y")[0].data_type == "quantitative"
and len(ldf.current_vis) == 1
and ldf.current_vis[0].mark == "line"
and len(get_filter_specs(ldf.intent)) > 0
):
query_vc = VisList(ldf.current_vis, ldf)
query_vis = query_vc[0]
preprocess(query_vis)
preprocess(vis)
return 1 - euclidean_dist(query_vis, vis)

# Line/Bar Chart
# print("r:", n_record, "m:", n_msr, "d:",n_dim)
if n_dim == 1 and (n_msr == 0 or n_msr == 1):
Expand Down
67 changes: 1 addition & 66 deletions lux/action/similarity.py → lux/interestingness/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,70 +17,7 @@
import math
import numpy as np
from lux.vis.VisList import VisList


def similar_pattern(ldf, intent, topK=-1):
    """
    Score every vis in the current vis collection against a query vis built from `intent`.

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame whose `current_vis.collection` is copied and used as the search space.

    intent: list[lux.Clause]
        Intent for the query visualization; exactly one clause must have a non-empty `value`.

    topK: int
        Number of visualizations to keep; the default of -1 keeps all of them.

    Returns
    -------
    recommendations : Dict[str,obj]
        The "Similarity" action with its description and scored collection.
        When the intent does not hold exactly one valued clause, a message is
        printed and None is returned instead.
    """
    valued_clauses = [clause for clause in intent if clause.value != ""]
    if len(valued_clauses) != 1:
        print("Query needs to have 1 row value")
        return None

    candidates = VisList(ldf.current_vis.collection.copy(), ldf)

    # The first vis compiled from the intent acts as the reference pattern.
    reference = VisList(intent, ldf)[0]
    preprocess(reference)

    for candidate in candidates:
        preprocess(candidate)
        candidate.score = euclidean_dist(reference, candidate)

    # Scores are distances at this point; presumably invert_order makes
    # smaller distances rank higher -- confirm against VisList.normalize_score.
    candidates.normalize_score(invert_order=True)
    if topK != -1:
        candidates = candidates.topK(topK)

    return {
        "action": "Similarity",
        "description": "Show other charts that are visually similar to the Current vis.",
        "collection": candidates,
    }


def aggregate(vis):
    """
    Reduce `vis.data` to one row per x value holding the mean of the y attribute.

    Nothing happens unless the vis has clauses on both the "x" and "y" channels.

    Parameters
    ----------
    vis : lux.vis.Vis
        vis that represents the candidate visualization; its `data` is replaced in place
    Returns
    -------
    None
    """
    x_clauses = vis.get_attr_by_channel("x")
    if not x_clauses:
        return
    y_clauses = vis.get_attr_by_channel("y")
    if not y_clauses:
        return

    x_name = x_clauses[0].attribute
    y_name = y_clauses[0].attribute

    # Keep only the two plotted columns, then average y within each x group.
    plotted = vis.data[[x_name, y_name]]
    grouped = plotted.groupby(x_name, as_index=False)
    vis.data = grouped.agg({y_name: "mean"}).copy()
from lux.utils.utils import get_filter_specs


def interpolate(vis, length):
Expand Down Expand Up @@ -204,6 +141,4 @@ def preprocess(vis):
-------
None
"""
aggregate(vis)
interpolate(vis, 100)
normalize(vis)
38 changes: 25 additions & 13 deletions lux/processor/Compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from lux.core.frame import LuxDataFrame
from lux.vis.VisList import VisList
from lux.utils import date_utils
from lux.utils import utils
import pandas as pd
import numpy as np
import warnings
Expand Down Expand Up @@ -179,12 +180,25 @@ def populate_data_type_model(ldf, vlist):
else:
chart_title = clause.value
vis.title = f"{clause.attribute} {clause.filter_op} {chart_title}"
vis._ndim = 0
vis._nmsr = 0

for clause in vis._inferred_intent:
if clause.value == "":
if clause.data_model == "dimension":
vis._ndim += 1
elif clause.data_model == "measure" and clause.attribute != "Record":
vis._nmsr += 1

@staticmethod
def remove_all_invalid(vis_collection: VisList) -> VisList:
"""
Given an expanded vis list, remove all visualizations that are invalid.
Currently, the invalid visualizations are ones that contain two of the same attribute, no more than two temporal attributes, or overlapping attributes (same filter attribute and visualized attribute).
Currently, a visualization is considered invalid if it contains any of:
- two of the same attribute,
- two or more temporal attributes,
- overlapping attributes (same filter attribute and visualized attribute),
- one temporal attribute together with two measures
Parameters
----------
vis_collection : list[lux.vis.Vis]
Expand All @@ -203,7 +217,11 @@ def remove_all_invalid(vis_collection: VisList) -> VisList:
if clause.data_type == "temporal":
num_temporal_specs += 1
all_distinct_specs = 0 == len(vis._inferred_intent) - len(attribute_set)
if num_temporal_specs < 2 and all_distinct_specs:
if (
num_temporal_specs < 2
and all_distinct_specs
and not (vis._nmsr == 2 and num_temporal_specs == 1)
):
new_vc.append(vis)
# else:
# warnings.warn("\nThere is more than one duplicate attribute specified in the intent.\nPlease check your intent specification again.")
Expand Down Expand Up @@ -235,17 +253,11 @@ def determine_encoding(ldf: LuxDataFrame, vis: Vis):
https://doi.org/10.1109/TVCG.2007.70594
"""
# Count number of measures and dimensions
ndim = 0
nmsr = 0
filters = []
for clause in vis._inferred_intent:
if clause.value == "":
if clause.data_model == "dimension":
ndim += 1
elif clause.data_model == "measure" and clause.attribute != "Record":
nmsr += 1
else: # preserve to add back to _inferred_intent later
filters.append(clause)
ndim = vis._ndim
nmsr = vis._nmsr
# preserve to add back to _inferred_intent later
filters = utils.get_filter_specs(vis._inferred_intent)

# Helper function (TODO: Move this into utils)
def line_or_bar(ldf, dimension: Clause, measure: Clause):
dim_type = dimension.data_type
Expand Down
45 changes: 45 additions & 0 deletions tests/test_action.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,3 +197,48 @@ def test_year_filter_value(global_var):
"T00:00:00.000000000" not in vis.to_Altair()
), "Year filter title contains extraneous string, not displayed as summarized string"
df.clear_intent()


def test_similarity(global_var):
    """Similarity on the cars data with an Origin=USA intent yields two vis, Japan scoring above Europe."""
    df = pytest.car_df
    df["Year"] = pd.to_datetime(df["Year"], format="%Y")
    intent = [
        lux.Clause("Year", channel="x"),
        lux.Clause("Displacement", channel="y"),
        lux.Clause("Origin=USA"),
    ]
    df.set_intent(intent)
    df._repr_html_()
    assert len(df.recommendation["Similarity"]) == 2
    ranked_list = df.recommendation["Similarity"]

    # Pick each vis by the value of its Origin filter clause.
    japan_matches = [
        vis for vis in ranked_list if vis.get_attr_by_attr_name("Origin")[0].value == "Japan"
    ]
    japan_vis = japan_matches[0]
    europe_matches = [
        vis for vis in ranked_list if vis.get_attr_by_attr_name("Origin")[0].value == "Europe"
    ]
    europe_vis = europe_matches[0]

    assert japan_vis.score > europe_vis.score
    df.clear_intent()


def test_similarity2():
    """Similarity on the real-estate data with a City=Crofton intent scores Morrisville above Watertown."""
    url = "https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/real_estate_tutorial.csv"
    df = pd.read_csv(url)

    df["Month"] = pd.to_datetime(df["Month"], format="%m")
    df["Year"] = pd.to_datetime(df["Year"], format="%Y")

    df.intent = [
        lux.Clause("Year"),
        lux.Clause("PctForeclosured"),
        lux.Clause("City=Crofton"),
    ]

    ranked_list = df.recommendation["Similarity"]

    # Pick each vis by the value of its City filter clause.
    morrisville_matches = [
        vis for vis in ranked_list if vis.get_attr_by_attr_name("City")[0].value == "Morrisville"
    ]
    morrisville_vis = morrisville_matches[0]
    watertown_matches = [
        vis for vis in ranked_list if vis.get_attr_by_attr_name("City")[0].value == "Watertown"
    ]
    watertown_vis = watertown_matches[0]

    assert morrisville_vis.score > watertown_vis.score
8 changes: 1 addition & 7 deletions tests/test_interestingness.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,6 @@ def test_interestingness_0_2_0(global_var):
assert interestingness(df.recommendation["Enhance"][0], df) != None
rank1 = -1
rank2 = -1
rank3 = -1
for f in range(0, len(df.recommendation["Enhance"])):
if (
str(df.recommendation["Enhance"][f]._inferred_intent[2].attribute) == "Origin"
Expand All @@ -238,12 +237,7 @@ def test_interestingness_0_2_0(global_var):
and str(df.recommendation["Enhance"][f].mark) == "scatter"
):
rank2 = f
if (
str(df.recommendation["Enhance"][f]._inferred_intent[2].attribute) == "Year"
and str(df.recommendation["Enhance"][f].mark) == "scatter"
):
rank3 = f
assert rank1 < rank2 and rank1 < rank3 and rank2 < rank3
assert rank1 < rank2
caitlynachen marked this conversation as resolved.
Show resolved Hide resolved

# check that top recommended filter graph score is not none and that ordering makes intuitive sense
assert interestingness(df.recommendation["Filter"][0], df) != None
Expand Down