lux-org · dorisjlee · Dec 31, 2020 · Sep 18, 2020 · Sep 18, 2020 · Sep 19, 2020
diff --git a/doc/source/guide/FAQ.rst b/doc/source/guide/FAQ.rst
@@ -64,6 +64,23 @@ How do I turn off Lux?
   To display only the Pandas view of the dataframe, print the dataframe by doing :code:`df.to_pandas()`.
   To turn off Lux completely, remove the :code:`import lux` statement and restart your Jupyter notebook.
 
+How do I disable sampling and have Lux visualize the full dataset?
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+  When a dataframe is large, Lux displays a warning saying "Large dataframe detected: Lux is only visualizing a random sample". If you would like to disable sampling and visualize the full dataset, you can run:
+
+  .. code-block:: python
+
+      lux.config.sampling = False
+
+  Note that if you have already loaded your data and displayed the visualizations, you need to reinitialize the dataframe by setting the config first and then loading your data again, as follows:
+
+  .. code-block:: python
+
+      lux.config.sampling = False
+      df = pd.read_csv("...")
+
+  If you want to fine-tune the sampling parameters, you can edit :code:`lux.config.sampling_start` and :code:`lux.config.sampling_cap`. See `this page <https://lux-api.readthedocs.io/en/latest/source/reference/config.html>`_ for more details.
+
 Troubleshooting Tips
 --------------------
 

diff --git a/doc/source/reference/config.rst b/doc/source/reference/config.rst
@@ -44,3 +44,31 @@ If you try to set the default_display to anything other than 'lux' or 'pandas,'
   :align: center
   :alt: Retrieves a single attribute from Lux's Action Manager using its defined id.
 
+Change the sampling parameters of Lux
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
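+To speed up visualization processing, Lux by default performs random sampling on datasets with more than 10000 rows. For datasets with more than 10000 but at most 30000 rows, Lux visualizes a random sample of 75% of the rows. For datasets with more than 30000 rows, Lux randomly samples 30000 rows from the dataset.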
+
+If we want to change these parameters, we can set `sampling_start` and `sampling_cap` via `lux.config`. By default, `sampling_start` is set to 10000 and `sampling_cap` is set to 30000. Note that `sampling_start` must always be less than or equal to `sampling_cap`. In the following block, we increase these sampling bounds.
+
+.. code-block:: python
+
+    lux.config.sampling_start = 20000
+    lux.config.sampling_cap = 40000
+
+If we want Lux to use the full dataset in the visualization, we can also disable sampling altogether (but note that this may result in long processing times). Below is an example of disabling sampling:
+
+.. code-block:: python
+
+    lux.config.sampling = False
+
+Disable the use of heatmaps for large datasets
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In addition to sampling, Lux replaces scatter plots with heatmaps for datasets with over 5000 rows to speed up the visualization process.
+
+We can disable this feature and revert to using scatter plots by running the following code block (but note that this may result in long processing times).
+
+.. code-block:: python
+
+    lux.config.heatmap = False
diff --git a/lux/_config/config.py b/lux/_config/config.py
@@ -155,6 +155,93 @@ def __init__(self):
         self.plot_config = None
         self.SQLconnection = ""
         self.executor = None
+        self._sampling_start = 10000
+        self._sampling_cap = 30000
+        self._sampling_flag = True
+        self._heatmap_flag = True
+
+    @property
+    def sampling_cap(self) -> int:
+        """Cap on the number of rows to sample."""
+        return self._sampling_cap
+
+    @sampling_cap.setter
+    def sampling_cap(self, sample_number: int) -> None:
+        """
+        Set the cap on the number of rows to sample.
+
+        Parameters
+        ----------
+        sample_number : int
+                Cap on the number of rows to sample. Must be greater than or equal to sampling_start.
+
+        Raises
+        ------
+        AssertionError
+                If sample_number is smaller than the current sampling_start.
+
+        Warns
+        -----
+        UserWarning
+                If sample_number is not an integer; the current cap is left unchanged.
+        """
+        # bool is a subclass of int, but True/False is not a meaningful row count,
+        # so it is rejected just like any other non-integer value.
+        if not isinstance(sample_number, int) or isinstance(sample_number, bool):
+            warnings.warn(
+                "The cap on the number of samples must be an integer.",
+                stacklevel=2,
+            )
+            return
+        # Raised explicitly instead of via `assert` so the check still runs under
+        # `python -O`; AssertionError is kept so the exception type is unchanged.
+        if sample_number < self._sampling_start:
+            raise AssertionError(
+                f"sampling_cap ({sample_number}) must be greater than or equal to "
+                f"sampling_start ({self._sampling_start})."
+            )
+        self._sampling_cap = sample_number
+
+    @property
+    def sampling_start(self) -> int:
+        """Number of rows required to begin sampling."""
+        return self._sampling_start
+
+    @sampling_start.setter
+    def sampling_start(self, sample_number: int) -> None:
+        """
+        Set the number of rows required to begin sampling.
+
+        Parameters
+        ----------
+        sample_number : int
+                Number of rows required to begin sampling. Must be smaller than or equal to sampling_cap.
+
+        Raises
+        ------
+        AssertionError
+                If sample_number is larger than the current sampling_cap.
+
+        Warns
+        -----
+        UserWarning
+                If sample_number is not an integer; the current value is left unchanged.
+        """
+        # bool is a subclass of int, but True/False is not a meaningful row count,
+        # so it is rejected just like any other non-integer value.
+        if not isinstance(sample_number, int) or isinstance(sample_number, bool):
+            warnings.warn(
+                "The sampling starting point must be an integer.",
+                stacklevel=2,
+            )
+            return
+        # Raised explicitly instead of via `assert` so the check still runs under
+        # `python -O`; AssertionError is kept so the exception type is unchanged.
+        if sample_number > self._sampling_cap:
+            raise AssertionError(
+                f"sampling_start ({sample_number}) must be smaller than or equal to "
+                f"sampling_cap ({self._sampling_cap})."
+            )
+        self._sampling_start = sample_number
+
+    @property
+    def sampling(self):
+        """Flag indicating whether sampling will occur."""
+        return self._sampling_flag
+
+    @sampling.setter
+    def sampling(self, sample_flag: bool) -> None:
+        """
+        Turn sampling on or off.
+
+        Parameters
+        ----------
+        sample_flag : bool
+                True to enable sampling, False to disable it. Any non-boolean
+                value triggers a warning and leaves the current flag unchanged.
+        """
+        if not isinstance(sample_flag, bool):
+            # Invalid input is reported with a warning rather than an exception.
+            warnings.warn(
+                "The flag for sampling must be a boolean.",
+                stacklevel=2,
+            )
+            return
+        self._sampling_flag = sample_flag
+
+    @property
+    def heatmap(self):
+        """Flag indicating whether a heatmap will be used instead of a scatter plot."""
+        return self._heatmap_flag
+
+    @heatmap.setter
+    def heatmap(self, heatmap_flag: bool) -> None:
+        """
+        Turn the heatmap substitution on or off.
+
+        Parameters
+        ----------
+        heatmap_flag : bool
+                True to use a heatmap instead of a scatter plot, False to keep
+                the scatter plot. Any non-boolean value triggers a warning and
+                leaves the current flag unchanged.
+        """
+        if not isinstance(heatmap_flag, bool):
+            # Invalid input is reported with a warning rather than an exception.
+            warnings.warn(
+                "The flag for enabling/disabling heatmaps must be a boolean.",
+                stacklevel=2,
+            )
+            return
+        self._heatmap_flag = heatmap_flag
 
     @property
     def default_display(self):

diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py
@@ -40,17 +40,19 @@ def __repr__(self):
     @staticmethod
     def execute_sampling(ldf: LuxDataFrame):
         # General Sampling for entire dataframe
-        SAMPLE_START = 10000
-        SAMPLE_CAP = 30000
+        SAMPLE_FLAG = lux.config.sampling
+        SAMPLE_START = lux.config.sampling_start
+        SAMPLE_CAP = lux.config.sampling_cap
         SAMPLE_FRAC = 0.75
-        if len(ldf) > SAMPLE_CAP:
+
+        if SAMPLE_FLAG and len(ldf) > SAMPLE_CAP:
             if ldf._sampled is None:  # memoize unfiltered sample df
                 ldf._sampled = ldf.sample(n=SAMPLE_CAP, random_state=1)
             ldf._message.add_unique(
                 f"Large dataframe detected: Lux is only visualizing a random sample capped at {SAMPLE_CAP} rows.",
                 priority=99,
             )
-        elif len(ldf) > SAMPLE_START:
+        elif SAMPLE_FLAG and len(ldf) > SAMPLE_START:
             if ldf._sampled is None:  # memoize unfiltered sample df
                 ldf._sampled = ldf.sample(frac=SAMPLE_FRAC, random_state=1)
             ldf._message.add_unique(
@@ -99,7 +101,7 @@ def execute(vislist: VisList, ldf: LuxDataFrame):
                 PandasExecutor.execute_binning(vis)
             elif vis.mark == "scatter":
                 HBIN_START = 5000
-                if len(ldf) > HBIN_START:
+                if lux.config.heatmap and len(ldf) > HBIN_START:
                     vis._postbin = True
                     ldf._message.add_unique(
                         f"Large scatterplots detected: Lux is automatically binning scatterplots to heatmaps.",

diff --git a/tests/test_config.py b/tests/test_config.py
@@ -196,6 +196,41 @@ def change_color_make_transparent_add_title(chart):
     assert title_addition in exported_code_str
 
 
+def test_sampling_flag_config():
+    """Check that turning off `lux.config.sampling` makes Lux visualize every row."""
+    df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv")
+    df._repr_html_()
+    # With sampling on, the visualized data is capped at 30000 rows.
+    assert df.recommendation["Correlation"][0].data.shape[0] == 30000
+    lux.config.sampling = False
+    try:
+        # NOTE(review): the copy presumably discards the previously computed
+        # recommendations so they are rebuilt under the new setting -- confirm.
+        df = df.copy()
+        df._repr_html_()
+        assert df.recommendation["Correlation"][0].data.shape[0] == 48895
+    finally:
+        # Re-enable sampling even when an assertion fails, so the global
+        # config change cannot leak into other tests.
+        lux.config.sampling = True
+
+
+def test_sampling_parameters_config():
+    """Check that lowering `sampling_start`/`sampling_cap` caps the visualized rows."""
+    df = pd.read_csv("lux/data/car.csv")
+    df._repr_html_()
+    # With the default bounds, all 392 rows are visualized.
+    assert df.recommendation["Correlation"][0].data.shape[0] == 392
+    try:
+        lux.config.sampling_start = 50
+        lux.config.sampling_cap = 100
+        df = pd.read_csv("lux/data/car.csv")
+        df._repr_html_()
+        assert df.recommendation["Correlation"][0].data.shape[0] == 100
+    finally:
+        # Restore the bounds even when an assertion fails, so the global config
+        # change cannot leak into other tests. The cap is raised first because
+        # sampling_start must never exceed sampling_cap.
+        lux.config.sampling_cap = 30000
+        lux.config.sampling_start = 10000
+
+
+def test_heatmap_flag_config():
+    """Check that turning off `lux.config.heatmap` stops scatter plots from being binned."""
+    df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv")
+    df._repr_html_()
+    assert df.recommendation["Correlation"][0]._postbin
+    lux.config.heatmap = False
+    try:
+        df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv")
+        # NOTE(review): this copy of a freshly loaded dataframe looks redundant;
+        # kept as-is to preserve what the test exercises -- confirm.
+        df = df.copy()
+        assert not df.recommendation["Correlation"][0]._postbin
+    finally:
+        # Re-enable heatmaps even when an assertion fails, so the global
+        # config change cannot leak into other tests.
+        lux.config.heatmap = True
+
+
 # TODO: This test does not pass in pytest but is working in Jupyter notebook.
 # def test_plot_setting(global_var):
 # 	df = pytest.car_df