
[SIP-3] Scheduled email reports for Slices / Dashboards #5294

Merged — 14 commits, Dec 11, 2018
9 changes: 9 additions & 0 deletions .gitignore
@@ -44,3 +44,12 @@ yarn-error.log
*.iml
venv
@eaDir/

# Test data
celery_results.sqlite
celerybeat-schedule
celerydb.sqlite
celerybeat.pid
geckodriver.log
ghostdriver.log
testCSV.csv
2 changes: 1 addition & 1 deletion docs/faq.rst
@@ -88,7 +88,7 @@ It's easy: use the ``Filter Box`` widget, build a slice, and add it to your
dashboard.

The ``Filter Box`` widget allows you to define a query to populate dropdowns
that can be use for filtering. To build the list of distinct values, we
that can be used for filtering. To build the list of distinct values, we
run a query, and sort the result by the metric you provide, sorting
descending.

116 changes: 98 additions & 18 deletions docs/installation.rst
@@ -591,14 +591,12 @@ Upgrading should be as straightforward as running::
superset db upgrade
superset init

SQL Lab
-------
SQL Lab is a powerful SQL IDE that works with all SQLAlchemy compatible
databases. By default, queries are executed in the scope of a web
request so they
may eventually timeout as queries exceed the maximum duration of a web
request in your environment, whether it'd be a reverse proxy or the Superset
server itself.
Celery Tasks
------------
On large analytic databases, it's common to run background jobs, reports
and/or queries that execute for minutes or hours. In certain cases, we need
to support long running tasks that execute beyond the typical web request's
timeout (30-60 seconds).

On large analytic databases, it's common to run queries that
execute for minutes or hours.
@@ -622,15 +620,41 @@ have the same configuration.

class CeleryConfig(object):
BROKER_URL = 'redis://localhost:6379/0'
CELERY_IMPORTS = ('superset.sql_lab', )
CELERY_IMPORTS = (
'superset.sql_lab',
'superset.tasks',
)
CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'
CELERY_ANNOTATIONS = {'tasks.add': {'rate_limit': '10/s'}}
CELERYD_LOG_LEVEL = 'DEBUG'
CELERYD_PREFETCH_MULTIPLIER = 10
CELERY_ACKS_LATE = True
CELERY_ANNOTATIONS = {
'sql_lab.get_sql_results': {
'rate_limit': '100/s',
},
'email_reports.send': {
'rate_limit': '1/s',
'time_limit': 120,
'soft_time_limit': 150,
'ignore_result': True,
},
}
CELERYBEAT_SCHEDULE = {
'email_reports.schedule_hourly': {
'task': 'email_reports.schedule_hourly',
'schedule': crontab(minute=1, hour='*'),
},
}

CELERY_CONFIG = CeleryConfig

To start a Celery worker to leverage the configuration run: ::
* To start a Celery worker to leverage the configuration run: ::

celery worker --app=superset.tasks.celery_app:app --pool=prefork -Ofair -c 4

celery worker --app=superset.sql_lab:celery_app --pool=gevent -Ofair
* To start a job which schedules periodic background jobs, run ::

celery beat --app=superset.tasks.celery_app:app

To set up a result backend, you need to pass an instance of a derivative
of ``werkzeug.contrib.cache.BaseCache`` to the ``RESULTS_BACKEND``
@@ -653,11 +677,65 @@ look something like:
RESULTS_BACKEND = RedisCache(
host='localhost', port=6379, key_prefix='superset_results')
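Any ``BaseCache``-compatible object works here. As an illustration of the interface only (a toy, not for production), a dict-backed backend needs just the ``get``/``set``/``delete`` trio:

```python
class DictCache(object):
    """Toy results backend mimicking the subset of the werkzeug
    BaseCache interface that a results backend relies on.
    Unbounded and single-process only -- use RedisCache or a
    similar shared cache in any real deployment."""

    def __init__(self):
        self._store = {}

    def get(self, key):
        return self._store.get(key)

    def set(self, key, value, timeout=None):
        # `timeout` is accepted for interface compatibility but ignored here
        self._store[key] = value
        return True

    def delete(self, key):
        return self._store.pop(key, None) is not None


RESULTS_BACKEND = DictCache()
```

Remember that a local, in-process backend like this defeats the purpose of a shared cluster, which is exactly why the docs recommend ``RedisCache``.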

Note that it's important that all the worker nodes and web servers in
the Superset cluster share a common metadata database.
This means that SQLite will not work in this context since it has
limited support for concurrency and
typically lives on the local file system.
**Important notes**

* It is important that all the worker nodes and web servers in
the Superset cluster share a common metadata database.
This means that SQLite will not work in this context since it has
limited support for concurrency and
typically lives on the local file system.

* There should only be one instance of ``celery beat`` running in your
entire setup. If not, background jobs can get scheduled multiple times
resulting in weird behaviors like duplicate delivery of reports,
higher than expected load / traffic etc.


Email Reports
-------------
Email reports allow users to schedule email delivery of

* slice and dashboard visualizations (attachment or inline)
* slice data (CSV attachment or inline table)

Schedules are defined in crontab format and each schedule
can have a list of recipients (all of them can receive a single mail,
or separate mails). For audit purposes, all outgoing mails can have a
mandatory bcc.
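The single-mail-versus-separate-mails choice above (stored as ``deliver_as_group`` in the schedule tables added by this PR) boils down to fan-out logic like this sketch — the helper name and signature are hypothetical, not Superset's actual API:

```python
def expand_deliveries(recipients, deliver_as_group, bcc=None):
    """Turn one schedule into a list of (to, bcc) pairs.

    Hypothetical helper: deliver_as_group=True sends a single mail
    addressed to all recipients; False sends one mail per recipient.
    `bcc` is the optional audit address added to every outgoing mail.
    """
    if deliver_as_group:
        batches = [list(recipients)]
    else:
        batches = [[r] for r in recipients]
    return [(to, bcc) for to in batches]


# One mail to both recipients, bcc'd for auditing:
expand_deliveries(['a@x.org', 'b@x.org'], True, bcc='audit@x.org')
# -> [(['a@x.org', 'b@x.org'], 'audit@x.org')]
```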

**Requirements**

* A selenium compatible driver & headless browser

* `geckodriver <https://github.com/mozilla/geckodriver>`_ and Firefox is preferred
* `chromedriver <http://chromedriver.chromium.org/>`_ is a good option too
* Run ``celery worker`` and ``celery beat`` as follows: ::

celery worker --app=superset.tasks.celery_app:app --pool=prefork -Ofair -c 4
celery beat --app=superset.tasks.celery_app:app

**Important notes**

* Be mindful of the concurrency setting for celery (using ``-c 4``).
Selenium/webdriver instances can consume a lot of CPU / memory on your servers.

* In some cases, if you notice a lot of leaked ``geckodriver`` processes, try running
your celery processes with ::

celery worker --pool=prefork --max-tasks-per-child=128 ...

* It is recommended to run separate workers for ``sql_lab`` and
``email_reports`` tasks. This can be done by using the ``queue`` field in ``CELERY_ANNOTATIONS``.
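A sketch of that split (the queue names ``sql`` and ``reports`` are arbitrary choices for illustration, not mandated by Superset): route each task family to its own queue in ``CELERY_ANNOTATIONS``, then start one worker per queue with ``-Q``:

```python
class CeleryConfig(object):
    BROKER_URL = 'redis://localhost:6379/0'
    CELERY_IMPORTS = ('superset.sql_lab', 'superset.tasks')
    CELERY_ANNOTATIONS = {
        # Dedicated queues so heavy webdriver work cannot starve SQL Lab;
        # the queue names below are illustrative only.
        'sql_lab.get_sql_results': {'queue': 'sql', 'rate_limit': '100/s'},
        'email_reports.send': {'queue': 'reports', 'rate_limit': '1/s'},
    }


# Then run one worker per queue, for example:
#   celery worker --app=superset.tasks.celery_app:app -Q sql -c 4
#   celery worker --app=superset.tasks.celery_app:app -Q reports -c 2
```

With this arrangement a stuck or resource-hungry report task only backs up the ``reports`` queue, while SQL Lab queries keep flowing through ``sql``.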

SQL Lab
-------
SQL Lab is a powerful SQL IDE that works with all SQLAlchemy compatible
databases. By default, queries are executed in the scope of a web
request, so they may eventually time out as they exceed the maximum duration of a web
request in your environment, whether that be a reverse proxy or the Superset
server itself. In such cases, it is preferable to use ``celery`` to run the queries
in the background. Please follow the examples and notes above to get your
celery setup working.

Also note that SQL Lab supports Jinja templating in queries and that it's
possible to overload
@@ -672,14 +750,16 @@ in this dictionary are made available for users to use in their SQL.
}
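For instance, a minimal ``JINJA_CONTEXT_ADDONS`` entry in ``superset_config.py`` might look like the following — the ``yesterday`` helper is a made-up example, not something Superset ships:

```python
import datetime


def yesterday():
    # Usable in SQL Lab as: SELECT * FROM logs WHERE ds = '{{ yesterday() }}'
    return (datetime.date.today() - datetime.timedelta(days=1)).isoformat()


# Each key in this dict becomes available in SQL Lab's Jinja context.
JINJA_CONTEXT_ADDONS = {
    'yesterday': yesterday,
}
```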


Celery Flower
-------------
Flower is a web based tool for monitoring the Celery cluster which you can
install from pip: ::

pip install flower

and run via: ::

celery flower --app=superset.sql_lab:celery_app
celery flower --app=superset.tasks.celery_app:app

Building from source
---------------------
3 changes: 3 additions & 0 deletions requirements.txt
@@ -20,6 +20,7 @@ chardet==3.0.4 # via requests
click==6.7
colorama==0.3.9
contextlib2==0.5.5
croniter==0.3.26
cryptography==1.9
defusedxml==0.5.0 # via python3-openid
docutils==0.14 # via botocore
@@ -69,9 +70,11 @@ python3-openid==3.1.0 # via flask-openid
pytz==2018.5 # via babel, celery, flower, pandas
pyyaml==3.13
requests==2.18.4
retry==0.9.2
rfc3986==1.1.0 # via tableschema
s3transfer==0.1.13 # via boto3
sasl==0.2.1 # via thrift-sasl
selenium==3.141.0
simplejson==3.15.0
six==1.11.0 # via bleach, cryptography, isodate, jsonlines, linear-tsv, pathlib2, polyline, pydruid, python-dateutil, sasl, sqlalchemy-utils, tableschema, tabulator, thrift
sqlalchemy-utils==0.32.21
3 changes: 3 additions & 0 deletions setup.py
@@ -93,6 +93,9 @@ def get_git_sha():
'thrift-sasl>=0.2.1',
'unicodecsv',
'unidecode>=0.04.21',
'croniter==0.3.25',
'selenium==3.14.0',
'retry==0.9.2',
],
Member @villebro commented on Aug 4, 2018:
Nit: is there a reason why these are pegged to these specific versions, and not unpegged, <= or >=? The version of croniter is current, but selenium has had two minor releases since, with the latest yesterday. If there are no known compatibility issues with either prior or future versions, perhaps these should be left unpegged in setup.py?

Contributor Author replied:
@villebro thank you for the comment. It was mostly a safe bet, to avoid breakages, especially with selenium. I wanted to peg it to a version which I knew worked. Selenium versions and some webdrivers end up having compatibility issues. I am OK with leaving it unpegged, but I don't think our testing is good enough to catch future breakages.

Member replied:
@mahendra IMO fixing deprecations and possible future breakages tends to be fairly straight forward, and shouldn't be a big issue. So from a maintainability perspective leaving them unpegged would probably be better, as it would allow for features/fixes to flow in automatically.

Contributor Author replied:
@villebro Noted. I just updated the code and removed the pinning. Pushed it to github.

To clarify, my worry was not about the ability to fix breakages. Rather it was to avoid breaking deploys. Specifically when someone installs/upgrades to superset==0.27.0 and their setup script accidentally upgrades selenium to an incompatible version. This could leave them with a broken deploy. We could fix it once we get bug reports.

Member @villebro replied on Aug 6, 2018:
@mahendra I understand, perhaps @betodealmeida or one of the official maintainers can weigh in? As there don't seem to be any known incompatibility issues with croniter>0.3.24 and selenium>3.12.0, and this is a new feature, I think any users testing it will be aware that they should expect a bug or two as this is rolled out. If incompatibility issues do arise, I feel they are best taken care of by making changes to the Superset code to accommodate API changes etc. Only in the case of a serious divergence in features or known bugs should we pin to an old version in master. Perhaps even fixing the dependency versions in setup.py of 0.27 might be an option.

Contributor Author replied:
@villebro agree with you. I have unpinned the versions. We can fix it in the future if required.

@betodealmeida thoughts?

extras_require={
'cors': ['flask-cors>=2.0.0'],
91 changes: 82 additions & 9 deletions superset/config.py
@@ -11,6 +11,7 @@
import os
import sys

from celery.schedules import crontab
from dateutil import tz
from flask_appbuilder.security.manager import AUTH_DB

@@ -296,19 +297,43 @@
# Default celery config is to use SQLA as a broker, in a production setting
# you'll want to use a proper broker as specified here:
# http://docs.celeryproject.org/en/latest/getting-started/brokers/index.html
"""
# Example:


class CeleryConfig(object):
BROKER_URL = 'sqla+sqlite:///celerydb.sqlite'
CELERY_IMPORTS = ('superset.sql_lab', )
CELERY_RESULT_BACKEND = 'db+sqlite:///celery_results.sqlite'
CELERY_ANNOTATIONS = {'tasks.add': {'rate_limit': '10/s'}}
CELERYD_LOG_LEVEL = 'DEBUG'
CELERYD_PREFETCH_MULTIPLIER = 1
CELERY_ACKS_LATE = True
BROKER_URL = 'sqla+sqlite:///celerydb.sqlite'
CELERY_IMPORTS = (
'superset.sql_lab',
'superset.tasks',
)
CELERY_RESULT_BACKEND = 'db+sqlite:///celery_results.sqlite'
CELERYD_LOG_LEVEL = 'DEBUG'
CELERYD_PREFETCH_MULTIPLIER = 1
CELERY_ACKS_LATE = True
CELERY_ANNOTATIONS = {
'sql_lab.get_sql_results': {
'rate_limit': '100/s',
},
'email_reports.send': {
'rate_limit': '1/s',
'time_limit': 120,
'soft_time_limit': 150,
'ignore_result': True,
},
}
CELERYBEAT_SCHEDULE = {
'email_reports.schedule_hourly': {
'task': 'email_reports.schedule_hourly',
'schedule': crontab(minute=1, hour='*'),
},
}


CELERY_CONFIG = CeleryConfig

"""
# Set celery config to None to disable all the above configuration
CELERY_CONFIG = None
"""

# static http headers to be served by your Superset server.
# This header prevents iFrames from other domains and
@@ -450,6 +475,54 @@ class CeleryConfig(object):
# using flask-compress
ENABLE_FLASK_COMPRESS = True

# Enable / disable scheduled email reports
Contributor Author commented:
@mistercrunch config to hide the feature.

ENABLE_SCHEDULED_EMAIL_REPORTS = False

# If enabled, certain features are run in debug mode
# Current list:
# * Emails are sent using dry-run mode (logging only)
SCHEDULED_EMAIL_DEBUG_MODE = False

# Email reports - minimum time resolution (in minutes) for the crontab
EMAIL_REPORTS_CRON_RESOLUTION = 15

# Email report configuration
# From address in emails
EMAIL_REPORT_FROM_ADDRESS = 'reports@superset.org'

# Send bcc of all reports to this address. Set to None to disable.
# This is useful for maintaining an audit trail of all email deliveries.
EMAIL_REPORT_BCC_ADDRESS = None

# User credentials to use for generating reports
# This user should have permissions to browse all the dashboards and
# slices.
# TODO: In the future, login as the owner of the item to generate reports
EMAIL_REPORTS_USER = 'admin'
EMAIL_REPORTS_SUBJECT_PREFIX = '[Report] '

# The webdriver to use for generating reports. Use one of the following
# firefox
# Requires: geckodriver and firefox installations
# Limitations: can be buggy at times
# chrome:
# Requires: headless chrome
# Limitations: unable to generate screenshots of elements
EMAIL_REPORTS_WEBDRIVER = 'firefox'

# Window size - this will impact the rendering of the data
WEBDRIVER_WINDOW = {
'dashboard': (1600, 2000),
'slice': (3000, 1200),
}

# Any config options to be passed as-is to the webdriver
WEBDRIVER_CONFIGURATION = {}

# The base URL to query for accessing the user interface
WEBDRIVER_BASEURL = 'http://0.0.0.0:8080/'
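Putting the webdriver settings together: the report worker essentially loads a URL derived from ``WEBDRIVER_BASEURL`` at the window size given by ``WEBDRIVER_WINDOW`` and screenshots the result. A sketch of the URL construction follows; the path templates are illustrative guesses, so verify them against your Superset version's routes:

```python
from urllib.parse import urljoin

WEBDRIVER_BASEURL = 'http://0.0.0.0:8080/'


def report_url(kind, object_id):
    """Build the UI URL a report worker would visit before taking a
    screenshot. The path patterns below are assumptions for the sake
    of the example, not guaranteed to match every Superset release."""
    paths = {
        'dashboard': 'superset/dashboard/%d/',
        'slice': 'superset/slice/%d/',
    }
    return urljoin(WEBDRIVER_BASEURL, paths[kind] % object_id)


report_url('dashboard', 11)
# -> 'http://0.0.0.0:8080/superset/dashboard/11/'
```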


try:
if CONFIG_PATH_ENV_VAR in os.environ:
# Explicitly import config module that is not in pythonpath; useful
@@ -0,0 +1,69 @@
"""models for email reports

Revision ID: 6c7537a6004a
Revises: e502db2af7be
Create Date: 2018-05-15 20:28:51.977572

"""

# revision identifiers, used by Alembic.
revision = '6c7537a6004a'
down_revision = 'a61b40f9f57f'

from alembic import op
import sqlalchemy as sa


def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.create_table('dashboard_email_schedules',
sa.Column('created_on', sa.DateTime(), nullable=True),
sa.Column('changed_on', sa.DateTime(), nullable=True),
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('active', sa.Boolean(), nullable=True),
sa.Column('crontab', sa.String(length=50), nullable=True),
sa.Column('recipients', sa.Text(), nullable=True),
sa.Column('deliver_as_group', sa.Boolean(), nullable=True),
sa.Column('delivery_type', sa.Enum('attachment', 'inline', name='emaildeliverytype'), nullable=True),
sa.Column('dashboard_id', sa.Integer(), nullable=True),
sa.Column('created_by_fk', sa.Integer(), nullable=True),
sa.Column('changed_by_fk', sa.Integer(), nullable=True),
sa.Column('user_id', sa.Integer(), nullable=True),
sa.ForeignKeyConstraint(['changed_by_fk'], ['ab_user.id'], ),
sa.ForeignKeyConstraint(['created_by_fk'], ['ab_user.id'], ),
sa.ForeignKeyConstraint(['dashboard_id'], ['dashboards.id'], ),
sa.ForeignKeyConstraint(['user_id'], ['ab_user.id'], ),
sa.PrimaryKeyConstraint('id')
)
op.create_index(op.f('ix_dashboard_email_schedules_active'), 'dashboard_email_schedules', ['active'], unique=False)
op.create_table('slice_email_schedules',
sa.Column('created_on', sa.DateTime(), nullable=True),
sa.Column('changed_on', sa.DateTime(), nullable=True),
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('active', sa.Boolean(), nullable=True),
sa.Column('crontab', sa.String(length=50), nullable=True),
sa.Column('recipients', sa.Text(), nullable=True),
sa.Column('deliver_as_group', sa.Boolean(), nullable=True),
sa.Column('delivery_type', sa.Enum('attachment', 'inline', name='emaildeliverytype'), nullable=True),
sa.Column('slice_id', sa.Integer(), nullable=True),
sa.Column('email_format', sa.Enum('visualization', 'data', name='sliceemailreportformat'), nullable=True),
sa.Column('created_by_fk', sa.Integer(), nullable=True),
sa.Column('changed_by_fk', sa.Integer(), nullable=True),
sa.Column('user_id', sa.Integer(), nullable=True),
sa.ForeignKeyConstraint(['changed_by_fk'], ['ab_user.id'], ),
sa.ForeignKeyConstraint(['created_by_fk'], ['ab_user.id'], ),
sa.ForeignKeyConstraint(['slice_id'], ['slices.id'], ),
sa.ForeignKeyConstraint(['user_id'], ['ab_user.id'], ),
sa.PrimaryKeyConstraint('id')
)
op.create_index(op.f('ix_slice_email_schedules_active'), 'slice_email_schedules', ['active'], unique=False)
# ### end Alembic commands ###


def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.drop_index(op.f('ix_slice_email_schedules_active'), table_name='slice_email_schedules')
op.drop_table('slice_email_schedules')
op.drop_index(op.f('ix_dashboard_email_schedules_active'), table_name='dashboard_email_schedules')
op.drop_table('dashboard_email_schedules')
# ### end Alembic commands ###
1 change: 1 addition & 0 deletions superset/models/__init__.py
@@ -1,3 +1,4 @@
from . import core # noqa
from . import sql_lab # noqa
from . import user_attributes # noqa
from . import schedules # noqa