From 7af4895a3432c48f444db458459e1f1d7ca04f48 Mon Sep 17 00:00:00 2001 From: Caitlyn Chen Date: Tue, 22 Dec 2020 15:20:55 -0800 Subject: [PATCH 1/7] similarity formatting fixed --- lux/action/similarity.py | 28 ++++++++++++++++++---------- lux/core/frame.py | 10 +++++++++- tests/test_action.py | 20 ++++++++++++++++++++ 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/lux/action/similarity.py b/lux/action/similarity.py index 174a4d43..963a4ea6 100644 --- a/lux/action/similarity.py +++ b/lux/action/similarity.py @@ -17,9 +17,10 @@ import math import numpy as np from lux.vis.VisList import VisList +from lux.utils.utils import get_filter_specs -def similar_pattern(ldf, intent, topK=-1): +def similar_pattern(ldf, *args): """ Generates visualizations with similar patterns to a query visualization. @@ -28,10 +29,7 @@ def similar_pattern(ldf, intent, topK=-1): ldf : lux.core.frame LuxDataFrame with underspecified intent. - intent: list[lux.Clause] - intent for specifying the visual query for the similarity search. - - topK: int + args: topK: int number of visual recommendations to return. Returns @@ -39,11 +37,23 @@ def similar_pattern(ldf, intent, topK=-1): recommendations : Dict[str,obj] object with a collection of visualizations that result from the Similarity action """ - row_specs = list(filter(lambda x: x.value != "", intent)) + + last = get_filter_specs(ldf.intent)[-1] + query = ldf.intent.copy()[0:-1] + # array of possible values for attribute + arr = ldf[last.attribute].unique().tolist() + query.append(lux.Clause(last.attribute, last.attribute, arr)) + row_specs = ldf.intent + if len(args) == 0: + topK = 15 + else: + topK = args[0][0] + + row_specs = list(filter(lambda x: x.value != "", row_specs)) if len(row_specs) == 1: - search_space_vc = VisList(ldf.current_vis.collection.copy(), ldf) + search_space_vc = VisList(query, ldf) - query_vc = VisList(intent, ldf) + query_vc = VisList(ldf.current_vis, ldf) query_vis = query_vc[0] preprocess(query_vis) # for loop to create assign euclidean distance @@ -204,6 +214,4 @@ def preprocess(vis): ------- None """ - aggregate(vis) - interpolate(vis, 100) normalize(vis) diff --git a/lux/core/frame.py b/lux/core/frame.py index 7eab2ed1..dcd86928 100644 --- a/lux/core/frame.py +++ b/lux/core/frame.py @@ -19,7 +19,7 @@ from lux.vis.VisList import VisList from lux.history.history import History from lux.utils.message import Message -from lux.utils.utils import check_import_lux_widget +from lux.utils.utils import check_import_lux_widget, get_filter_specs from typing import Dict, Union, List, Callable import warnings import traceback @@ -436,6 +436,7 @@ def maintain_recs(self): from lux.action.enhance import enhance from lux.action.filter import filter from lux.action.generalize import generalize + from lux.action.similarity import similar_pattern from lux.action.row_group import row_group from lux.action.column_group import column_group @@ -452,6 +453,12 @@ def maintain_recs(self): one_current_vis = ( lambda ldf: ldf.current_vis is not None and len(ldf.current_vis) == 1 ) + one_current_vis_similarity = ( + lambda ldf: ldf.current_vis is not None + and len(ldf.current_vis) == 1 + and ldf.current_vis[0].mark == "line" + and len(get_filter_specs(ldf.intent)) > 0 + ) multiple_current_vis = ( lambda ldf: ldf.current_vis is not None and len(ldf.current_vis) > 1 ) @@ -465,6 +472,7 @@ def maintain_recs(self): lux.register_action("Enhance", enhance, one_current_vis) lux.register_action("Filter", filter, one_current_vis) lux.register_action("Generalize", generalize, one_current_vis) + lux.register_action("Similarity", similar_pattern, one_current_vis_similarity, 15) lux.register_action("Custom", custom, multiple_current_vis) diff --git a/tests/test_action.py b/tests/test_action.py index f3f35a21..59508eae 100644 --- a/tests/test_action.py +++ b/tests/test_action.py @@ -197,3 +197,23 @@ def test_year_filter_value(global_var): "T00:00:00.000000000" not in vis.to_Altair() ), "Year filter title contains extraneous string, not displayed as summarized string" df.clear_intent() + + +def test_similarity(global_var): + df = pytest.car_df + df["Year"] = pd.to_datetime(df["Year"], format="%Y") + df.set_intent( + [ + lux.Clause("Year", channel="x"), + lux.Clause("Displacement", channel="y"), + lux.Clause("Origin=USA"), + ] + ) + df._repr_html_() + assert len(df.recommendation["Similarity"]) == 3 + assert ( + df.recommendation["Similarity"][0].score + > df.recommendation["Similarity"][1].score + > df.recommendation["Similarity"][2].score + ) + df.clear_intent() From 124d780153977143caa50d2b9a1158ad3a8c0ea1 Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Wed, 23 Dec 2020 17:33:41 +0800 Subject: [PATCH 2/7] added another similarity test case; fixed bug where colored heatmap dimension is temporal (invalidate all 2 msr 1 temporal case) --- lux/processor/Compiler.py | 38 +++++++++++++++++++++++------------ tests/test_action.py | 21 +++++++++++++++++++ tests/test_interestingness.py | 8 +------- 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/lux/processor/Compiler.py b/lux/processor/Compiler.py index b07f52ce..1f155197 100644 --- a/lux/processor/Compiler.py +++ b/lux/processor/Compiler.py @@ -18,6 +18,7 @@ from lux.core.frame import LuxDataFrame from lux.vis.VisList import VisList from lux.utils import date_utils +from lux.utils import utils import pandas as pd import numpy as np import warnings @@ -179,12 +180,25 @@ def populate_data_type_model(ldf, vlist): else: chart_title = clause.value vis.title = f"{clause.attribute} {clause.filter_op} {chart_title}" + vis._ndim = 0 + vis._nmsr = 0 + + for clause in vis._inferred_intent: + if clause.value == "": + if clause.data_model == "dimension": + vis._ndim += 1 + elif clause.data_model == "measure" and clause.attribute != "Record": + vis._nmsr += 1 @staticmethod def remove_all_invalid(vis_collection: VisList) -> VisList: """ Given an expanded vis list, remove all visualizations that are invalid. - Currently, the invalid visualizations are ones that contain two of the same attribute, no more than two temporal attributes, or overlapping attributes (same filter attribute and visualized attribute). + Currently, the invalid visualizations are ones that do not contain: + - two of the same attribute, + - more than two temporal attributes, + - no overlapping attributes (same filter attribute and visualized attribute), + - more than 1 temporal attribute with 2 or more measures Parameters ---------- vis_collection : list[lux.vis.Vis] @@ -203,7 +217,11 @@ def remove_all_invalid(vis_collection: VisList) -> VisList: if clause.data_type == "temporal": num_temporal_specs += 1 all_distinct_specs = 0 == len(vis._inferred_intent) - len(attribute_set) - if num_temporal_specs < 2 and all_distinct_specs: + if ( + num_temporal_specs < 2 + and all_distinct_specs + and not (vis._nmsr == 2 and num_temporal_specs == 1) + ): new_vc.append(vis) # else: # warnings.warn("\nThere is more than one duplicate attribute specified in the intent.\nPlease check your intent specification again.") @@ -235,17 +253,11 @@ def determine_encoding(ldf: LuxDataFrame, vis: Vis): https://doi.org/10.1109/TVCG.2007.70594 """ # Count number of measures and dimensions - ndim = 0 - nmsr = 0 - filters = [] - for clause in vis._inferred_intent: - if clause.value == "": - if clause.data_model == "dimension": - ndim += 1 - elif clause.data_model == "measure" and clause.attribute != "Record": - nmsr += 1 - else: # preserve to add back to _inferred_intent later - filters.append(clause) + ndim = vis._ndim + nmsr = vis._nmsr + # preserve to add back to _inferred_intent later + filters = utils.get_filter_specs(vis._inferred_intent) + # Helper function (TODO: Move this into utils) def line_or_bar(ldf, dimension: Clause, measure: Clause): dim_type = dimension.data_type diff --git a/tests/test_action.py b/tests/test_action.py index 59508eae..d32f9880 100644 --- a/tests/test_action.py +++ b/tests/test_action.py @@ -217,3 +217,24 @@ def test_similarity(global_var): > df.recommendation["Similarity"][2].score ) df.clear_intent() + + +def test_similarity2(): + df = pd.read_csv( + "https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/real_estate_tutorial.csv" + ) + + df["Month"] = pd.to_datetime(df["Month"], format="%m") + df["Year"] = pd.to_datetime(df["Year"], format="%Y") + + df.intent = [lux.Clause("Year"), lux.Clause("PctForeclosured"), lux.Clause("City=Crofton")] + + ranked_list = df.recommendation["Similarity"] + + morrisville_vis = list( + filter(lambda vis: vis.get_attr_by_attr_name("City")[0].value == "Morrisville", ranked_list) + )[0] + watertown_vis = list( + filter(lambda vis: vis.get_attr_by_attr_name("City")[0].value == "Watertown", ranked_list) + )[0] + assert morrisville_vis.score > watertown_vis.score diff --git a/tests/test_interestingness.py b/tests/test_interestingness.py index a13ee406..15b8e74d 100644 --- a/tests/test_interestingness.py +++ b/tests/test_interestingness.py @@ -226,7 +226,6 @@ def test_interestingness_0_2_0(global_var): assert interestingness(df.recommendation["Enhance"][0], df) != None rank1 = -1 rank2 = -1 - rank3 = -1 for f in range(0, len(df.recommendation["Enhance"])): if ( str(df.recommendation["Enhance"][f]._inferred_intent[2].attribute) == "Origin" @@ -238,12 +237,7 @@ def test_interestingness_0_2_0(global_var): and str(df.recommendation["Enhance"][f].mark) == "scatter" ): rank2 = f - if ( - str(df.recommendation["Enhance"][f]._inferred_intent[2].attribute) == "Year" - and str(df.recommendation["Enhance"][f].mark) == "scatter" - ): - rank3 = f - assert rank1 < rank2 and rank1 < rank3 and rank2 < rank3 + assert rank1 < rank2 # check that top recommended filter graph score is not none and that ordering makes intuitive sense assert interestingness(df.recommendation["Filter"][0], df) != None From 96feed4ff614ed8572f25613524e7d5c83d98e1a Mon Sep 17 00:00:00 2001 From: Caitlyn Chen Date: Fri, 25 Dec 2020 14:40:19 -0800 Subject: [PATCH 3/7] filter and similarity together --- lux/action/filter.py | 26 +++++++++++++++++++++++--- lux/action/similarity.py | 9 +++++---- lux/core/frame.py | 8 -------- lux/interestingness/interestingness.py | 19 +++++++++++++++++++ tests/test_action.py | 16 ++++++++++------ 5 files changed, 57 insertions(+), 21 deletions(-) diff --git a/lux/action/filter.py b/lux/action/filter.py index af9a495b..70d85e0e 100644 --- a/lux/action/filter.py +++ b/lux/action/filter.py @@ -18,6 +18,7 @@ from lux.vis.VisList import VisList from lux.processor.Compiler import Compiler from lux.utils import utils +from lux.utils.utils import get_filter_specs def filter(ldf): @@ -112,9 +113,28 @@ def get_complementary_ops(fltr_op): new_spec.append(new_filter) temp_vis = Vis(new_spec) output.append(temp_vis) + if ( + ldf.current_vis is not None + and len(ldf.current_vis) == 1 + and ldf.current_vis[0].mark == "line" + and len(get_filter_specs(ldf.intent)) > 0 + ): + recommendation = { + "action": "Similarity", + "description": "Show other charts that are visually similar to the Current vis.", + } + last = get_filter_specs(ldf.intent)[-1] + output = ldf.intent.copy()[0:-1] + # array of possible values for attribute + arr = ldf[last.attribute].unique().tolist() + output.append(lux.Clause(last.attribute, last.attribute, arr)) vlist = lux.vis.VisList.VisList(output, ldf) - for vis in vlist: - vis.score = interestingness(vis, ldf) + vlist_copy = lux.vis.VisList.VisList(output, ldf) + for i in range(len(vlist_copy)): + vlist[i].score = interestingness(vlist_copy[i], ldf) vlist = vlist.topK(15) - recommendation["collection"] = vlist + if recommendation["action"] == "Similarity": + recommendation["collection"] = vlist[1:] + else: + recommendation["collection"] = vlist return recommendation diff --git a/lux/action/similarity.py b/lux/action/similarity.py index 963a4ea6..06f18ca5 100644 --- a/lux/action/similarity.py +++ b/lux/action/similarity.py @@ -61,13 +61,14 @@ def similar_pattern(ldf, *args): "action": "Similarity", "description": "Show other charts that are visually similar to the Current vis.", } - for vis in search_space_vc: - preprocess(vis) - vis.score = euclidean_dist(query_vis, vis) + search_space_vc_copy = VisList(query, ldf) + for i in range(len(search_space_vc_copy)): + preprocess(search_space_vc_copy[i]) + search_space_vc[i].score = euclidean_dist(query_vis, search_space_vc_copy[i]) search_space_vc.normalize_score(invert_order=True) if topK != -1: search_space_vc = search_space_vc.topK(topK) - recommendation["collection"] = search_space_vc + recommendation["collection"] = search_space_vc[1:] return recommendation else: print("Query needs to have 1 row value") diff --git a/lux/core/frame.py b/lux/core/frame.py index dcd86928..86d288f0 100644 --- a/lux/core/frame.py +++ b/lux/core/frame.py @@ -436,7 +436,6 @@ def maintain_recs(self): from lux.action.enhance import enhance from lux.action.filter import filter from lux.action.generalize import generalize - from lux.action.similarity import similar_pattern from lux.action.row_group import row_group from lux.action.column_group import column_group @@ -453,12 +452,6 @@ def maintain_recs(self): one_current_vis = ( lambda ldf: ldf.current_vis is not None and len(ldf.current_vis) == 1 ) - one_current_vis_similarity = ( - lambda ldf: ldf.current_vis is not None - and len(ldf.current_vis) == 1 - and ldf.current_vis[0].mark == "line" - and len(get_filter_specs(ldf.intent)) > 0 - ) multiple_current_vis = ( lambda ldf: ldf.current_vis is not None and len(ldf.current_vis) > 1 ) @@ -472,7 +465,6 @@ def maintain_recs(self): lux.register_action("Enhance", enhance, one_current_vis) lux.register_action("Filter", filter, one_current_vis) lux.register_action("Generalize", generalize, one_current_vis) - lux.register_action("Similarity", similar_pattern, one_current_vis_similarity, 15) lux.register_action("Custom", custom, multiple_current_vis) diff --git a/lux/interestingness/interestingness.py b/lux/interestingness/interestingness.py index 0c94757e..148e1f6d 100644 --- a/lux/interestingness/interestingness.py +++ b/lux/interestingness/interestingness.py @@ -22,6 +22,9 @@ from pandas.api.types import is_datetime64_any_dtype as is_datetime from scipy.spatial.distance import euclidean import lux +from lux.utils.utils import get_filter_specs +from lux.action.similarity import preprocess, euclidean_dist +from lux.vis.VisList import VisList def interestingness(vis: Vis, ldf: LuxDataFrame) -> int: @@ -68,6 +71,22 @@ def interestingness(vis: Vis, ldf: LuxDataFrame) -> int: dimension_lst = vis.get_attr_by_data_model("dimension") measure_lst = vis.get_attr_by_data_model("measure") v_size = len(vis.data) + + if ( + n_dim == 1 + and (n_msr == 0 or n_msr == 1) + and ldf.current_vis is not None + and vis.get_attr_by_channel("y")[0].data_type == "quantitative" + and len(ldf.current_vis) == 1 + and ldf.current_vis[0].mark == "line" + and len(get_filter_specs(ldf.intent)) > 0 + ): + query_vc = VisList(ldf.current_vis, ldf) + query_vis = query_vc[0] + preprocess(query_vis) + preprocess(vis) + return -1 * euclidean_dist(query_vis, vis) + # Line/Bar Chart # print("r:", n_record, "m:", n_msr, "d:",n_dim) if n_dim == 1 and (n_msr == 0 or n_msr == 1): diff --git a/tests/test_action.py b/tests/test_action.py index d32f9880..bbe161e0 100644 --- a/tests/test_action.py +++ b/tests/test_action.py @@ -210,12 +210,16 @@ def test_similarity(global_var): ] ) df._repr_html_() - assert len(df.recommendation["Similarity"]) == 3 - assert ( - df.recommendation["Similarity"][0].score - > df.recommendation["Similarity"][1].score - > df.recommendation["Similarity"][2].score - ) + assert len(df.recommendation["Similarity"]) == 2 + ranked_list = df.recommendation["Similarity"] + + japan_vis = list( + filter(lambda vis: vis.get_attr_by_attr_name("Origin")[0].value == "Japan", ranked_list) + )[0] + europe_vis = list( + filter(lambda vis: vis.get_attr_by_attr_name("Origin")[0].value == "Europe", ranked_list) + )[0] + assert japan_vis.score > europe_vis.score df.clear_intent() From ce2209c163b7cb86d339008016997b7e2f3a5ac8 Mon Sep 17 00:00:00 2001 From: Caitlyn Chen Date: Fri, 25 Dec 2020 14:40:37 -0800 Subject: [PATCH 4/7] filter and similarity together --- lux/action/similarity.py | 55 ---------------------------------------- 1 file changed, 55 deletions(-) diff --git a/lux/action/similarity.py b/lux/action/similarity.py index 06f18ca5..4e63b770 100644 --- a/lux/action/similarity.py +++ b/lux/action/similarity.py @@ -19,61 +19,6 @@ from lux.vis.VisList import VisList from lux.utils.utils import get_filter_specs - -def similar_pattern(ldf, *args): - """ - Generates visualizations with similar patterns to a query visualization. - - Parameters - ---------- - ldf : lux.core.frame - LuxDataFrame with underspecified intent. - - args: topK: int - number of visual recommendations to return. - - Returns - ------- - recommendations : Dict[str,obj] - object with a collection of visualizations that result from the Similarity action - """ - - last = get_filter_specs(ldf.intent)[-1] - query = ldf.intent.copy()[0:-1] - # array of possible values for attribute - arr = ldf[last.attribute].unique().tolist() - query.append(lux.Clause(last.attribute, last.attribute, arr)) - row_specs = ldf.intent - if len(args) == 0: - topK = 15 - else: - topK = args[0][0] - - row_specs = list(filter(lambda x: x.value != "", row_specs)) - if len(row_specs) == 1: - search_space_vc = VisList(query, ldf) - - query_vc = VisList(ldf.current_vis, ldf) - query_vis = query_vc[0] - preprocess(query_vis) - # for loop to create assign euclidean distance - recommendation = { - "action": "Similarity", - "description": "Show other charts that are visually similar to the Current vis.", - } - search_space_vc_copy = VisList(query, ldf) - for i in range(len(search_space_vc_copy)): - preprocess(search_space_vc_copy[i]) - search_space_vc[i].score = euclidean_dist(query_vis, search_space_vc_copy[i]) - search_space_vc.normalize_score(invert_order=True) - if topK != -1: - search_space_vc = search_space_vc.topK(topK) - recommendation["collection"] = search_space_vc[1:] - return recommendation - else: - print("Query needs to have 1 row value") - - def aggregate(vis): """ Aggregates data values on the y axis so that the vis is a time series From 96cc20ea6e3e8368e2cd67589f191b3b4ce0c8b7 Mon Sep 17 00:00:00 2001 From: Caitlyn Chen Date: Fri, 25 Dec 2020 14:42:31 -0800 Subject: [PATCH 5/7] remove filter --- lux/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lux/core/frame.py b/lux/core/frame.py index 86d288f0..7eab2ed1 100644 --- a/lux/core/frame.py +++ b/lux/core/frame.py @@ -19,7 +19,7 @@ from lux.vis.VisList import VisList from lux.history.history import History from lux.utils.message import Message -from lux.utils.utils import check_import_lux_widget, get_filter_specs +from lux.utils.utils import check_import_lux_widget from typing import Dict, Union, List, Callable import warnings import traceback From 5352a00e8bf67cdc0bb7342a9cae64503cbfcd1d Mon Sep 17 00:00:00 2001 From: Caitlyn Chen Date: Fri, 25 Dec 2020 14:50:39 -0800 Subject: [PATCH 6/7] black line length --- lux/action/similarity.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lux/action/similarity.py b/lux/action/similarity.py index 4e63b770..8bcbb85e 100644 --- a/lux/action/similarity.py +++ b/lux/action/similarity.py @@ -19,6 +19,7 @@ from lux.vis.VisList import VisList from lux.utils.utils import get_filter_specs + def aggregate(vis): """ Aggregates data values on the y axis so that the vis is a time series From 2ff809819c58561a510fce5b36f8f76719f08f27 Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Sat, 26 Dec 2020 22:42:32 +0800 Subject: [PATCH 7/7] file reorg and clean; change sim metric --- lux/interestingness/interestingness.py | 4 ++-- lux/{action => interestingness}/similarity.py | 20 ------------------- 2 files changed, 2 insertions(+), 22 deletions(-) rename lux/{action => interestingness}/similarity.py (88%) diff --git a/lux/interestingness/interestingness.py b/lux/interestingness/interestingness.py index 148e1f6d..dd9615a6 100644 --- a/lux/interestingness/interestingness.py +++ b/lux/interestingness/interestingness.py @@ -23,7 +23,7 @@ from scipy.spatial.distance import euclidean import lux from lux.utils.utils import get_filter_specs -from lux.action.similarity import preprocess, euclidean_dist +from lux.interestingness.similarity import preprocess, euclidean_dist from lux.vis.VisList import VisList @@ -85,7 +85,7 @@ def interestingness(vis: Vis, ldf: LuxDataFrame) -> int: query_vis = query_vc[0] preprocess(query_vis) preprocess(vis) - return -1 * euclidean_dist(query_vis, vis) + return 1 - euclidean_dist(query_vis, vis) # Line/Bar Chart # print("r:", n_record, "m:", n_msr, "d:",n_dim) diff --git a/lux/action/similarity.py b/lux/interestingness/similarity.py similarity index 88% rename from lux/action/similarity.py rename to lux/interestingness/similarity.py index 8bcbb85e..8d810909 100644 --- a/lux/action/similarity.py +++ b/lux/interestingness/similarity.py @@ -20,26 +20,6 @@ from lux.utils.utils import get_filter_specs -def aggregate(vis): - """ - Aggregates data values on the y axis so that the vis is a time series - - Parameters - ---------- - vis : lux.vis.Vis - vis that represents the candidate visualization - Returns - ------- - None - """ - if vis.get_attr_by_channel("x") and vis.get_attr_by_channel("y"): - - xAxis = vis.get_attr_by_channel("x")[0].attribute - yAxis = vis.get_attr_by_channel("y")[0].attribute - - vis.data = vis.data[[xAxis, yAxis]].groupby(xAxis, as_index=False).agg({yAxis: "mean"}).copy() - - def interpolate(vis, length): """ Interpolates the vis data so that the number of data points is fixed to a constant