From cdc615fbe9025915cef5975181698e6f0ee8d766 Mon Sep 17 00:00:00 2001
From: Harsh <80324346+harshpatel4crest@users.noreply.github.com>
Date: Wed, 1 Jun 2022 15:26:06 +0530
Subject: [PATCH 1/4] TDL-7301: Handle multiline `CRITICAL` error messages
(#137)
* updated code to log each line of multiline error messages at the CRITICAL level
* run unit tests in CircleCI
* updated config.yml file
---
.circleci/config.yml | 23 +++++++++++++++
tap_salesforce/__init__.py | 3 +-
.../test_multiline_critical_error_message.py | 29 +++++++++++++++++++
3 files changed, 54 insertions(+), 1 deletion(-)
create mode 100644 tests/unittests/test_multiline_critical_error_message.py
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 52e47638..bed7da97 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -45,6 +45,23 @@ jobs:
pip install pylint
echo "pylint will skip the following: $PYLINT_DISABLE_LIST"
pylint tap_salesforce -d "$PYLINT_DISABLE_LIST,stop-iteration-return"
+ run_unit_tests:
+ executor: docker-executor
+ steps:
+ - checkout
+ - attach_workspace:
+ at: /usr/local/share/virtualenvs
+ - run:
+ name: 'Run Unit Tests'
+ command: |
+ source /usr/local/share/virtualenvs/tap-salesforce/bin/activate
+ pip install nose coverage
+ nosetests --with-coverage --cover-erase --cover-package=tap_salesforce --cover-html-dir=htmlcov tests/unittests
+ coverage html
+ - store_test_results:
+ path: test_output/report.xml
+ - store_artifacts:
+ path: htmlcov
run_integration_tests:
executor: docker-executor
parallelism: 8
@@ -83,6 +100,12 @@ workflows:
- tier-1-tap-user
requires:
- ensure_env
+ - run_unit_tests:
+ context:
+ - circleci-user
+ - tier-1-tap-user
+ requires:
+ - ensure_env
- run_integration_tests:
context:
- circleci-user
diff --git a/tap_salesforce/__init__.py b/tap_salesforce/__init__.py
index 8dae2651..07aba2c9 100644
--- a/tap_salesforce/__init__.py
+++ b/tap_salesforce/__init__.py
@@ -417,5 +417,6 @@ def main():
LOGGER.critical(e)
sys.exit(1)
except Exception as e:
- LOGGER.critical(e)
+ for error_line in str(e).splitlines():
+ LOGGER.critical(error_line)
raise e
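For illustration, a minimal sketch of the behaviour this change produces, assuming a standard `logging`-style logger (the snippet is not part of the patch):

    import logging

    logging.basicConfig(format="%(levelname)s %(message)s")
    LOGGER = logging.getLogger("tap_salesforce")

    error = Exception("Error syncing Transaction__c: 400 Client Error\nInvalidSessionId\nInvalid session id")

    # logging the exception object directly would prefix only the first line with CRITICAL;
    # splitting the message logs every line at CRITICAL level
    for error_line in str(error).splitlines():
        LOGGER.critical(error_line)
    # CRITICAL Error syncing Transaction__c: 400 Client Error
    # CRITICAL InvalidSessionId
    # CRITICAL Invalid session id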
diff --git a/tests/unittests/test_multiline_critical_error_message.py b/tests/unittests/test_multiline_critical_error_message.py
new file mode 100644
index 00000000..b7fe47e5
--- /dev/null
+++ b/tests/unittests/test_multiline_critical_error_message.py
@@ -0,0 +1,29 @@
+import unittest
+from unittest import mock
+from tap_salesforce import main
+
+# raises a multiline error; used as the side effect of the mocked "main_impl"
+def raise_error():
+ raise Exception("""Error syncing Transaction__c: 400 Client Error: Bad Request for url: https://test.my.salesforce.com/services/async/41.0/job/7502K00000IcACtQAN/batch/test123j/result/test123b Response:
+ InvalidSessionId
+ Invalid session id
+ """)
+
+class TestMultiLineCriticalErrorMessage(unittest.TestCase):
+ """
+ Test case to verify every line in the multiline error contains 'CRITICAL'
+ """
+
+ @mock.patch("tap_salesforce.LOGGER.critical")
+ @mock.patch("tap_salesforce.main_impl")
+ def test_multiline_critical_error_message(self, mocked_main_impl, mocked_logger_critical):
+ # mock "main_impl" to raise a multiline error
+ mocked_main_impl.side_effect = raise_error
+
+ # verify "Exception" is raise on function call
+ with self.assertRaises(Exception):
+ main()
+
+ # verify "LOGGER.critical" is called 5 times, as the error raised contains 5 lines
+ self.assertEqual(mocked_logger_critical.call_count, 5)
From 7ea4e5279ac7dc98cc01e608df67df25a6b89990 Mon Sep 17 00:00:00 2001
From: karanpanchal-crest
Date: Wed, 1 Jun 2022 15:38:14 +0530
Subject: [PATCH 2/4] TDL-15349 Add Date Windowing (#140)
* add date windowing
* resolved pylint error
* resolved pylint error
* resolved pylint error
* add unittest
* added comments for logic explanation
* resolved pylint error
* update config.yml
* add remaining unittest
* add change in bulk
* resolved pylint error
* update config.yml
* added unittest for date window gaps
* resolve unittest failure
* resolved review comments
* resolved pylint duplicate code error
Co-authored-by: harshpatel4crest
---
.circleci/config.yml | 4 +-
tap_salesforce/salesforce/__init__.py | 11 ++
tap_salesforce/salesforce/bulk.py | 94 +++++++----
tap_salesforce/salesforce/rest.py | 8 +-
.../test_salesforce_date_windowing.py | 158 ++++++++++++++++++
5 files changed, 236 insertions(+), 39 deletions(-)
create mode 100644 tests/unittests/test_salesforce_date_windowing.py
diff --git a/.circleci/config.yml b/.circleci/config.yml
index bed7da97..1ecc1aa7 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -44,7 +44,7 @@ jobs:
source /usr/local/share/virtualenvs/tap-salesforce/bin/activate
pip install pylint
echo "pylint will skip the following: $PYLINT_DISABLE_LIST"
- pylint tap_salesforce -d "$PYLINT_DISABLE_LIST,stop-iteration-return"
+ pylint tap_salesforce -d "$PYLINT_DISABLE_LIST,stop-iteration-return,logging-format-interpolation"
run_unit_tests:
executor: docker-executor
steps:
@@ -127,4 +127,4 @@ workflows:
filters:
branches:
only:
- - master
+ - master
\ No newline at end of file
diff --git a/tap_salesforce/salesforce/__init__.py b/tap_salesforce/salesforce/__init__.py
index 0ac961ad..b366388a 100644
--- a/tap_salesforce/salesforce/__init__.py
+++ b/tap_salesforce/salesforce/__init__.py
@@ -447,3 +447,14 @@ def get_blacklisted_fields(self):
raise TapSalesforceException(
"api_type should be REST or BULK was: {}".format(
self.api_type))
+
+ def get_window_end_date(self, start_date, end_date):
+ # to update end_date, subtract 'half_day_range' (i.e. half of the days between start_date and end_date)
+ # when the number of days is odd, '//' rounds the half down to the nearest integer
+ half_day_range = (end_date - start_date) // 2
+
+ if half_day_range.days == 0:
+ raise TapSalesforceException(
+ "Attempting to query by 0 day range, this would cause infinite looping.")
+
+ return end_date - half_day_range
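A rough worked example of the halving performed by `get_window_end_date` (the dates are made up):

    from datetime import datetime

    start_date = datetime(2019, 2, 4)
    end_date = datetime(2019, 2, 8)

    half_day_range = (end_date - start_date) // 2   # timedelta of 2 days
    new_end_date = end_date - half_day_range        # 2019-02-06, i.e. the window is halved

    # with a window of a single day, (end_date - start_date) // 2 is only 12 hours,
    # so half_day_range.days == 0 and the TapSalesforceException above is raised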
diff --git a/tap_salesforce/salesforce/bulk.py b/tap_salesforce/salesforce/bulk.py
index e6596bd1..3b99073b 100644
--- a/tap_salesforce/salesforce/bulk.py
+++ b/tap_salesforce/salesforce/bulk.py
@@ -5,6 +5,7 @@
import time
import tempfile
import singer
+import singer.utils as singer_utils
from singer import metrics
import requests
from requests.exceptions import RequestException
@@ -18,7 +19,7 @@
PK_CHUNKED_BATCH_STATUS_POLLING_SLEEP = 60
ITER_CHUNK_SIZE = 1024
DEFAULT_CHUNK_SIZE = 100000 # Max is 250000
-
+MAX_RETRIES = 4
LOGGER = singer.get_logger()
# pylint: disable=inconsistent-return-statements
@@ -117,25 +118,28 @@ def _bulk_query(self, catalog_entry, state):
if batch_status['state'] == 'Failed':
if self._can_pk_chunk_job(batch_status['stateMessage']):
- batch_status = self._bulk_query_with_pk_chunking(catalog_entry, start_date)
- job_id = batch_status['job_id']
-
- # Set pk_chunking to True to indicate that we should write a bookmark differently
- self.sf.pk_chunking = True
-
- # Add the bulk Job ID and its batches to the state so it can be resumed if necessary
- tap_stream_id = catalog_entry['tap_stream_id']
- state = singer.write_bookmark(state, tap_stream_id, 'JobID', job_id)
- state = singer.write_bookmark(state, tap_stream_id, 'BatchIDs', batch_status['completed'][:])
-
- for completed_batch_id in batch_status['completed']:
- for result in self.get_batch_results(job_id, completed_batch_id, catalog_entry):
- yield result
- # Remove the completed batch ID and write state
- state['bookmarks'][catalog_entry['tap_stream_id']]["BatchIDs"].remove(completed_batch_id)
- LOGGER.info("Finished syncing batch %s. Removing batch from state.", completed_batch_id)
- LOGGER.info("Batches to go: %d", len(state['bookmarks'][catalog_entry['tap_stream_id']]["BatchIDs"]))
- singer.write_state(state)
+ # Get list of batch_status with pk_chunking or date_windowing
+ status_list = self._bulk_with_window([], catalog_entry, start_date)
+
+ for batch_status in status_list:
+ job_id = batch_status['job_id']
+
+ # Set pk_chunking to True to indicate that we should write a bookmark differently
+ self.sf.pk_chunking = True
+
+ # Add the bulk Job ID and its batches to the state so it can be resumed if necessary
+ tap_stream_id = catalog_entry['tap_stream_id']
+ state = singer.write_bookmark(state, tap_stream_id, 'JobID', job_id)
+ state = singer.write_bookmark(state, tap_stream_id, 'BatchIDs', batch_status['completed'][:])
+
+ for completed_batch_id in batch_status['completed']:
+ for result in self.get_batch_results(job_id, completed_batch_id, catalog_entry):
+ yield result
+ # Remove the completed batch ID and write state
+ state['bookmarks'][catalog_entry['tap_stream_id']]["BatchIDs"].remove(completed_batch_id)
+ LOGGER.info("Finished syncing batch %s. Removed batch from state.", completed_batch_id)
+ LOGGER.info("Batches to go: %d", len(state['bookmarks'][catalog_entry['tap_stream_id']]["BatchIDs"]))
+ singer.write_state(state)
else:
raise TapSalesforceException(batch_status['stateMessage'])
else:
@@ -148,18 +152,11 @@ def _bulk_query_with_pk_chunking(self, catalog_entry, start_date):
# Create a new job
job_id = self._create_job(catalog_entry, True)
- self._add_batch(catalog_entry, job_id, start_date, False)
+ self._add_batch(catalog_entry, job_id, start_date, order_by_clause=False)
batch_status = self._poll_on_pk_chunked_batch_status(job_id)
batch_status['job_id'] = job_id
- if batch_status['failed']:
- raise TapSalesforceException(
- "One or more batches failed during PK chunked job. {} failed out of {} total batches. First 20 failed batches: {}".format(
- len(batch_status['failed']),
- len(batch_status['completed']) + len(batch_status['failed']),
- list(batch_status['failed'].items())[:20]))
-
# Close the job after all the batches are complete
self._close_job(job_id)
@@ -194,11 +191,11 @@ def _create_job(self, catalog_entry, pk_chunking=False):
return job['id']
- def _add_batch(self, catalog_entry, job_id, start_date, order_by_clause=True):
+ def _add_batch(self, catalog_entry, job_id, start_date, end_date=None, order_by_clause=True):
endpoint = "job/{}/batch".format(job_id)
url = self.bulk_url.format(self.sf.instance_url, endpoint)
- body = self.sf._build_query_string(catalog_entry, start_date, order_by_clause=order_by_clause)
+ body = self.sf._build_query_string(catalog_entry, start_date, end_date, order_by_clause=order_by_clause)
headers = self._get_bulk_headers()
headers['Content-Type'] = 'text/csv'
@@ -357,3 +354,40 @@ def _iter_lines(self, response):
if pending is not None:
yield pending
+
+ def _bulk_with_window(self, status_list, catalog_entry, start_date_str, end_date=None, retries=MAX_RETRIES):
+ """Bulk api call with date windowing"""
+ sync_start = singer_utils.now()
+ if end_date is None:
+ end_date = sync_start
+ LOGGER.info("Retrying Bulk Query with PK Chunking")
+ else:
+ LOGGER.info("Retrying Bulk Query with window of date {} to {}".format(start_date_str, end_date.strftime('%Y-%m-%dT%H:%M:%SZ')))
+
+ if retries == 0:
+ raise TapSalesforceException("Ran out of retries attempting to query Salesforce Object {}".format(catalog_entry['stream']))
+
+ job_id = self._create_job(catalog_entry, True)
+ self._add_batch(catalog_entry, job_id, start_date_str, end_date.strftime('%Y-%m-%dT%H:%M:%SZ'), False)
+ batch_status = self._poll_on_pk_chunked_batch_status(job_id)
+ batch_status['job_id'] = job_id
+ # Close the job after all the batches are complete
+ self._close_job(job_id)
+
+ if batch_status['failed']:
+ LOGGER.info("Failed Bulk Query with window of date {} to {}".format(start_date_str, end_date.strftime('%Y-%m-%dT%H:%M:%SZ')))
+ # If batch_status is failed then reduce date window by half by updating end_date
+ end_date = self.sf.get_window_end_date(singer_utils.strptime_with_tz(start_date_str), end_date)
+
+ return self._bulk_with_window(status_list, catalog_entry, start_date_str, end_date, retries - 1)
+
+ else:
+ status_list.append(batch_status)
+
+ # If the date range was chunked (an end_date was passed), sync
+ # from the end_date -> now
+ if end_date < sync_start:
+ next_start_date_str = end_date.strftime('%Y-%m-%dT%H:%M:%SZ')
+ return self._bulk_with_window(status_list, catalog_entry, next_start_date_str, retries=retries)
+
+ return status_list
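Roughly, when the first PK chunked job fails, the windowing proceeds as follows (the same sequence the unit test below asserts on):

    window 1: start_date .. now               -> batch fails, the window is halved
    window 2: start_date .. half_window_date  -> succeeds, batch_status appended to status_list
    window 3: half_window_date .. now         -> remaining range synced, status_list returned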
diff --git a/tap_salesforce/salesforce/rest.py b/tap_salesforce/salesforce/rest.py
index 92c120bf..408cd4d0 100644
--- a/tap_salesforce/salesforce/rest.py
+++ b/tap_salesforce/salesforce/rest.py
@@ -71,13 +71,7 @@ def _query_recur(
raise ex
if retryable:
- start_date = singer_utils.strptime_with_tz(start_date_str)
- half_day_range = (end_date - start_date) // 2
- end_date = end_date - half_day_range
-
- if half_day_range.days == 0:
- raise TapSalesforceException(
- "Attempting to query by 0 day range, this would cause infinite looping.")
+ end_date = self.sf.get_window_end_date(singer_utils.strptime_with_tz(start_date_str), end_date)
query = self.sf._build_query_string(catalog_entry, singer_utils.strftime(start_date),
singer_utils.strftime(end_date))
diff --git a/tests/unittests/test_salesforce_date_windowing.py b/tests/unittests/test_salesforce_date_windowing.py
new file mode 100644
index 00000000..6f0e937b
--- /dev/null
+++ b/tests/unittests/test_salesforce_date_windowing.py
@@ -0,0 +1,158 @@
+import datetime
+import unittest
+from unittest import mock
+from tap_salesforce import Salesforce
+from tap_salesforce.salesforce import Bulk
+from dateutil import tz
+import singer
+
+# function to return batch status as 'failed' to force date windowing
+def mocked_batch_status(count):
+ if count < 2:
+ return {"failed": "test"}
+ else:
+ return {"failed": {}}
+
+@mock.patch('tap_salesforce.salesforce.Bulk._close_job')
+@mock.patch('tap_salesforce.salesforce.Bulk._poll_on_pk_chunked_batch_status', side_effect = mocked_batch_status)
+@mock.patch('tap_salesforce.salesforce.Bulk._create_job', side_effect=[1, 2, 3, 4, 5])
+@mock.patch('tap_salesforce.salesforce.bulk.singer_utils.now')
+class TestBulkDateWindow(unittest.TestCase):
+
+ # start date
+ start_date = '2019-02-04T12:15:00Z'
+ # 'Salesforce' object
+ sf = Salesforce(
+ default_start_date=start_date,
+ api_type="BULK")
+ # dummy catalog entry
+ catalog_entry = {
+ 'stream': 'User',
+ 'tap_stream_id': 'User',
+ 'schema': {
+ "properties": {
+ "Id": {
+ "type": "string"
+ },
+ "SystemModstamp": {
+ "anyOf": [{
+ "type": "string",
+ "format": "date-time"
+ },
+ {
+ "type": [
+ "string",
+ "null"
+ ]
+ }
+ ]
+ }
+ }
+ },
+ 'metadata': [
+ {
+ "breadcrumb": [],
+ "metadata": {
+ "selected": True,
+ "replication-key": "SystemModstamp",
+ "table-key-properties": [
+ "Id"
+ ]
+ }
+ },
+ {
+ "breadcrumb": [
+ "properties",
+ "SystemModstamp"
+ ],
+ "metadata": {
+ "inclusion": "automatic"
+ }
+ },
+ {
+ "breadcrumb": [
+ "properties",
+ "Id"
+ ],
+ "metadata": {
+ "inclusion": "automatic"
+ }
+ }
+ ]
+ }
+
+ # mocked now date
+ now_date_1 = datetime.datetime(2022, 5, 2, 12, 15, 00, tzinfo=tz.UTC)
+ now_date_1_str = now_date_1.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+ # mocked now date
+ now_date_2 = datetime.datetime(2019, 2, 5, 12, 15, 00, tzinfo=tz.UTC)
+
+ @mock.patch('tap_salesforce.salesforce.Bulk._add_batch')
+ def test_bulk_date_windowing_with_max_retries_0(self, mocked_add_batch, mocked_singer_util_now, mocked_create_job, mocked_batch_status, mocked_close_job):
+ """
+ Verify that when the data is too large the date windowing mechanism kicks in,
+ and that a proper exception is raised if no data is returned after MAX_RETRIES retries
+ """
+
+ mocked_singer_util_now.return_value = self.now_date_1
+
+ with self.assertRaises(Exception) as e:
+ Bulk(self.sf)._bulk_with_window([], self.catalog_entry, self.start_date, retries=0)
+
+ self.assertEqual(str(e.exception), 'Ran out of retries attempting to query Salesforce Object User', "Did not get the expected exception")
+
+ @mock.patch('tap_salesforce.salesforce.Bulk._add_batch')
+ def test_bulk_date_windowing_with_half_day_range_0(self, mocked_add_batch, mocked_singer_util_now, mocked_create_job, mocked_batch_status, mocked_close_job):
+ """
+ Verify that when the data is too large the date windowing mechanism kicks in,
+ and that a proper exception is raised if the window shrinks to 0 days without returning data
+ """
+
+ mocked_singer_util_now.return_value = self.now_date_2
+
+ with self.assertRaises(Exception) as e:
+ Bulk(self.sf)._bulk_with_window([], self.catalog_entry, self.start_date)
+
+ self.assertEqual(str(e.exception), 'Attempting to query by 0 day range, this would cause infinite looping.', "Did not get the expected exception")
+
+ @mock.patch('xmltodict.parse')
+ @mock.patch('tap_salesforce.salesforce.Salesforce._make_request')
+ def test_bulk_date_window_gaps(self, mocked_make_request, mocked_xmltodict_parse, mocked_singer_util_now, mocked_create_job, mocked_batch_status, mocked_close_job):
+ """
+ Test case to verify there are no gaps in the date window
+ """
+
+ # mock singer_utils.now
+ mocked_singer_util_now.return_value = self.now_date_1
+ # mock xmltodict.parse
+ mocked_xmltodict_parse.return_value = {
+ "batchInfo": {
+ "id": 1234
+ }
+ }
+
+ # function call with start date
+ Bulk(self.sf)._bulk_with_window([], self.catalog_entry, self.start_date)
+
+ # collect 'body' (query) from '_make_request' function arguments
+ actual_queries = [kwargs.get("body") for args, kwargs in mocked_make_request.call_args_list]
+
+ # calculate half window date for assertion
+ half_day = (self.now_date_1 - singer.utils.strptime_with_tz(self.start_date)) // 2
+ half_window_date = (self.now_date_1 - half_day).strftime('%Y-%m-%dT%H:%M:%SZ')
+
+ # create expected queries
+ expected_queries = [
+ # failed call for the whole date window, i.e. start date to now date
+ f'SELECT Id,SystemModstamp FROM User WHERE SystemModstamp >= {self.start_date} AND SystemModstamp < {self.now_date_1_str}',
+ # date window divided into half, query from start date to half window
+ f'SELECT Id,SystemModstamp FROM User WHERE SystemModstamp >= {self.start_date} AND SystemModstamp < {half_window_date}',
+ # query from half window to now date
+ f'SELECT Id,SystemModstamp FROM User WHERE SystemModstamp >= {half_window_date} AND SystemModstamp < {self.now_date_1_str}'
+ ]
+
+ # verify we called '_make_request' 3 times
+ self.assertEqual(mocked_make_request.call_count, 3, "Function is not called the expected number of times")
+ # verify the queries are called as expected
+ self.assertEqual(actual_queries, expected_queries)
From 1cc8d0378dff0f600960b3ede336135537f9f12c Mon Sep 17 00:00:00 2001
From: karanpanchal-crest
Date: Wed, 1 Jun 2022 15:43:59 +0530
Subject: [PATCH 3/4] TDL-16006: Fix interruptible full table bookmarking
strategy for bulk API (#138)
* Fix Full Table Bookmarking Strategy for bulk API
* add detailed comments
* update config.yml
* remove unused package
* updated unittest
Co-authored-by: harshpatel4crest
---
tap_salesforce/sync.py | 4 ++--
.../test_salesforce_null_bookmark.py | 24 +++++++++++++++++++
2 files changed, 26 insertions(+), 2 deletions(-)
create mode 100644 tests/unittests/test_salesforce_null_bookmark.py
diff --git a/tap_salesforce/sync.py b/tap_salesforce/sync.py
index a9de4f14..65ee3024 100644
--- a/tap_salesforce/sync.py
+++ b/tap_salesforce/sync.py
@@ -173,8 +173,8 @@ def sync_records(sf, catalog_entry, state, counter):
state = singer.write_bookmark(
state, catalog_entry['tap_stream_id'], 'version', None)
- # If pk_chunking is set, only write a bookmark at the end
- if sf.pk_chunking:
+ # If pk_chunking is set and the selected stream has a replication key, only write a bookmark at the end
+ if sf.pk_chunking and replication_key:
# Write a bookmark with the highest value we've seen
state = singer.write_bookmark(
state,
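For context: a stream synced as FULL_TABLE has no replication key, so with pk_chunking enabled the end-of-sync bookmark could previously be written under a null key; with this guard only the table version is left in state, e.g. {"bookmarks": {"OpportunityLineItem": {"version": null}}} (the shape the unit test below asserts).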
diff --git a/tests/unittests/test_salesforce_null_bookmark.py b/tests/unittests/test_salesforce_null_bookmark.py
new file mode 100644
index 00000000..a477933a
--- /dev/null
+++ b/tests/unittests/test_salesforce_null_bookmark.py
@@ -0,0 +1,24 @@
+import unittest
+from unittest import mock
+from tap_salesforce import Salesforce, metrics
+from tap_salesforce.sync import sync_records
+import json
+
+class TestNullBookmarkTesting(unittest.TestCase):
+ @mock.patch('tap_salesforce.salesforce.Salesforce.query', side_effect=lambda test1, test2: [])
+ def test_not_null_bookmark_for_incremental_stream(self, mocked_query):
+ """
+ Ensure that, after fixing the bookmark-writing logic, the state file does not contain a null
+ replication-key entry when an incremental stream is selected to sync as a full table stream
+
+ """
+ sf = Salesforce(default_start_date='2019-02-04T12:15:00Z', api_type="BULK")
+
+ sf.pk_chunking = True
+ catalog_entry = {"stream": "OpportunityLineItem", "schema": {}, "metadata":[], "tap_stream_id": "OpportunityLineItem"}
+ state = {}
+ counter = metrics.record_counter('OpportunityLineItem')
+ sync_records(sf, catalog_entry, state, counter)
+ # the write_state function converts the Python dictionary to a JSON string
+ state = json.dumps(state)
+ self.assertEqual(state, '{"bookmarks": {"OpportunityLineItem": {"version": null}}}', "Did not get the expected state value")
\ No newline at end of file
From 87035fa423a5ac19a43fac422e792a833369cc34 Mon Sep 17 00:00:00 2001
From: Harsh <80324346+harshpatel4crest@users.noreply.github.com>
Date: Fri, 3 Jun 2022 10:17:08 +0530
Subject: [PATCH 4/4] TDL-16115: Add lookback window advanced config option for
Incremental Syncs (#135)
* added lookback window
* resolved pylint and unittest error
* resolved unittest error
* updated lookback window code
* updated lookback code to work for incremental syncs
* added some comments
* updated comments
* added function comments
* resolved review comments
* addressed review comments
* updated lookback window integration test and unittests
* Update unit tests to match expected implementation (#141)
We do not want to apply the lookback window to the start date
* updated the code to use lookback if present
* resolved CircleCI failure and updated README file
* set Account as INCREMENTAL before sync
* updated state date and fixed pylint error
Co-authored-by: Andy Lu
---
README.md | 3 +
tap_salesforce/__init__.py | 7 +-
tap_salesforce/salesforce/__init__.py | 24 +++--
tap_salesforce/salesforce/bulk.py | 3 +-
tests/test_salesforce_lookback_window.py | 107 +++++++++++++++++++++++
tests/unittests/test_lookback_window.py | 97 ++++++++++++++++++++
6 files changed, 232 insertions(+), 9 deletions(-)
create mode 100644 tests/test_salesforce_lookback_window.py
create mode 100644 tests/unittests/test_lookback_window.py
diff --git a/README.md b/README.md
index ef66e3fc..be556e11 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@ $ tap-salesforce --config config.json --properties properties.json --state state
"start_date": "2017-11-02T00:00:00Z",
"api_type": "BULK",
"select_fields_by_default": true
+ "lookback_window": 10
}
```
@@ -40,6 +41,8 @@ The `start_date` is used by the tap as a bound on SOQL queries when searching fo
The `api_type` is used to switch the behavior of the tap between using Salesforce's "REST" and "BULK" APIs. When new fields are discovered in Salesforce objects, the `select_fields_by_default` key describes whether or not the tap will select those fields by default.
+The `lookback_window` (in seconds) subtracts the specified number of seconds from the bookmark so that records modified shortly before the bookmark are synced again. Recommended value: 10 seconds.
+
## Run Discovery
To run discovery mode, execute the tap with the config file.
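To illustrate the arithmetic (a sketch only; the values match the unit tests further down): with a bookmark of 2022-05-23T00:00:00Z and a lookback_window of 60, the next sync starts 60 seconds before the bookmark.

    from datetime import datetime, timedelta

    bookmark = datetime(2022, 5, 23)                      # replication-key bookmark from state
    lookback_window = 60                                  # seconds, from config
    sync_start = bookmark - timedelta(seconds=lookback_window)
    print(sync_start.isoformat())                         # 2022-05-22T23:59:00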
diff --git a/tap_salesforce/__init__.py b/tap_salesforce/__init__.py
index 07aba2c9..a0fb6874 100644
--- a/tap_salesforce/__init__.py
+++ b/tap_salesforce/__init__.py
@@ -375,6 +375,10 @@ def main_impl():
sf = None
try:
+ # get lookback window from config
+ lookback_window = CONFIG.get('lookback_window')
+ lookback_window = int(lookback_window) if lookback_window else None
+
sf = Salesforce(
refresh_token=CONFIG['refresh_token'],
sf_client_id=CONFIG['client_id'],
@@ -384,7 +388,8 @@ def main_impl():
is_sandbox=CONFIG.get('is_sandbox'),
select_fields_by_default=CONFIG.get('select_fields_by_default'),
default_start_date=CONFIG.get('start_date'),
- api_type=CONFIG.get('api_type'))
+ api_type=CONFIG.get('api_type'),
+ lookback_window=lookback_window)
sf.login()
if args.discover:
diff --git a/tap_salesforce/salesforce/__init__.py b/tap_salesforce/salesforce/__init__.py
index b366388a..493f1ba6 100644
--- a/tap_salesforce/salesforce/__init__.py
+++ b/tap_salesforce/salesforce/__init__.py
@@ -1,3 +1,4 @@
+import datetime
import re
import threading
import time
@@ -210,7 +211,8 @@ def __init__(self,
is_sandbox=None,
select_fields_by_default=None,
default_start_date=None,
- api_type=None):
+ api_type=None,
+ lookback_window=None):
self.api_type = api_type.upper() if api_type else None
self.refresh_token = refresh_token
self.token = token
@@ -235,9 +237,10 @@ def __init__(self,
self.login_timer = None
self.data_url = "{}/services/data/v52.0/{}"
self.pk_chunking = False
+ self.lookback_window = lookback_window
# validate start_date
- singer_utils.strptime(default_start_date)
+ singer_utils.strptime_to_utc(default_start_date)
def _get_standard_headers(self):
return {"Authorization": "Bearer {}".format(self.access_token)}
@@ -370,7 +373,6 @@ def describe(self, sobject=None):
return resp.json()
- # pylint: disable=no-self-use
def _get_selected_properties(self, catalog_entry):
mdata = metadata.to_map(catalog_entry['metadata'])
properties = catalog_entry['schema'].get('properties', {})
@@ -382,12 +384,22 @@ def _get_selected_properties(self, catalog_entry):
def get_start_date(self, state, catalog_entry):
+ """
+ Return the start date if the state contains no bookmark,
+ otherwise return the bookmark, minus the lookback window if one is configured
+ """
catalog_metadata = metadata.to_map(catalog_entry['metadata'])
replication_key = catalog_metadata.get((), {}).get('replication-key')
- return (singer.get_bookmark(state,
- catalog_entry['tap_stream_id'],
- replication_key) or self.default_start_date)
+ # get bookmark value from the state
+ bookmark_value = singer.get_bookmark(state, catalog_entry['tap_stream_id'], replication_key)
+ sync_start_date = bookmark_value or self.default_start_date
+
+ # if the state contains a bookmark, subtract the lookback window from the bookmark
+ if bookmark_value and self.lookback_window:
+ sync_start_date = singer_utils.strftime(singer_utils.strptime_with_tz(sync_start_date) - datetime.timedelta(seconds=self.lookback_window))
+
+ return sync_start_date
def _build_query_string(self, catalog_entry, start_date, end_date=None, order_by_clause=True):
selected_properties = self._get_selected_properties(catalog_entry)
diff --git a/tap_salesforce/salesforce/bulk.py b/tap_salesforce/salesforce/bulk.py
index 3b99073b..081db803 100644
--- a/tap_salesforce/salesforce/bulk.py
+++ b/tap_salesforce/salesforce/bulk.py
@@ -101,7 +101,7 @@ def _get_bulk_headers(self):
return {"X-SFDC-Session": self.sf.access_token,
"Content-Type": "application/json"}
- def _can_pk_chunk_job(self, failure_message): # pylint: disable=no-self-use
+ def _can_pk_chunk_job(self, failure_message):
return "QUERY_TIMEOUT" in failure_message or \
"Retried more than 15 times" in failure_message or \
"Failed to write query result" in failure_message
@@ -331,7 +331,6 @@ def _close_job(self, job_id):
headers=self._get_bulk_headers(),
body=json.dumps(body))
- # pylint: disable=no-self-use
def _iter_lines(self, response):
"""Clone of the iter_lines function from the requests library with the change
to pass keepends=True in order to ensure that we do not strip the line breaks
diff --git a/tests/test_salesforce_lookback_window.py b/tests/test_salesforce_lookback_window.py
new file mode 100644
index 00000000..915a2b06
--- /dev/null
+++ b/tests/test_salesforce_lookback_window.py
@@ -0,0 +1,107 @@
+from datetime import datetime, timedelta
+from tap_tester import connections, runner, menagerie
+from base import SalesforceBaseTest
+
+class SalesforceLookbackWindow(SalesforceBaseTest):
+
+ # subtract the desired number of seconds from the date and return
+ def get_simulated_date(self, dtime, format, seconds=0):
+ date_stripped = datetime.strptime(dtime, format)
+ return_date = date_stripped - timedelta(seconds=seconds)
+
+ return datetime.strftime(return_date, format)
+
+ @staticmethod
+ def name():
+ return 'tap_tester_salesforce_lookback_window'
+
+ def get_properties(self): # pylint: disable=arguments-differ
+ return {
+ 'start_date' : '2021-11-10T00:00:00Z',
+ 'instance_url': 'https://singer2-dev-ed.my.salesforce.com',
+ 'select_fields_by_default': 'true',
+ 'api_type': self.salesforce_api,
+ 'is_sandbox': 'false',
+ 'lookback_window': 86400
+ }
+
+ def expected_sync_streams(self):
+ return {
+ 'Account'
+ }
+
+ def run_test(self):
+ # create connection
+ conn_id = connections.ensure_connection(self)
+ # create state file
+ state = {
+ 'bookmarks':{
+ 'Account': {
+ 'SystemModstamp': '2021-11-12T00:00:00.000000Z'
+ }
+ }
+ }
+ # set the state before running the sync
+ menagerie.set_state(conn_id, state)
+
+ # run in check mode
+ found_catalogs = self.run_and_verify_check_mode(conn_id)
+
+ # select certain catalogs
+ expected_streams = self.expected_sync_streams()
+ catalog_entries = [catalog for catalog in found_catalogs
+ if catalog.get('tap_stream_id') in expected_streams]
+
+ # stream and field selection
+ self.select_all_streams_and_fields(conn_id, catalog_entries)
+
+ # make 'Account' stream as INCREMENTAL to use lookback window
+ self.set_replication_methods(conn_id, catalog_entries, {'Account': self.INCREMENTAL})
+
+ # run sync
+ self.run_and_verify_sync(conn_id)
+
+ # get synced records
+ sync_records = runner.get_records_from_target_output()
+
+ # get replication keys
+ expected_replication_keys = self.expected_replication_keys()
+
+ # get the bookmark, i.e. the date from which the sync started
+ bookmark = state.get('bookmarks').get('Account').get('SystemModstamp')
+ # calculate the simulated bookmark by subtracting lookback window seconds
+ bookmark_with_lookback_window = self.get_simulated_date(bookmark, format=self.BOOKMARK_COMPARISON_FORMAT, seconds=self.get_properties()['lookback_window'])
+
+ for stream in expected_streams:
+ with self.subTest(stream=stream):
+
+ # get replication key for stream
+ replication_key = list(expected_replication_keys[stream])[0]
+
+ # get records
+ records = [record.get('data') for record in sync_records.get(stream).get('messages')
+ if record.get('action') == 'upsert']
+
+ # verify the records are returned in ASCENDING order:
+ # every record's date should be less than or equal to the next record's date
+ for i in range(len(records) - 1):
+ self.assertLessEqual(self.parse_date(records[i].get(replication_key)), self.parse_date(records[i+1].get(replication_key)))
+
+ # Verify the sync records respect the (simulated) bookmark value
+ for record in records:
+ self.assertGreaterEqual(self.parse_date(record.get(replication_key)), self.parse_date(bookmark_with_lookback_window),
+ msg='The record does not respect the lookback window.')
+
+ # check if the 1st record is between lookback date and bookmark:
+ # lookback_date <= record < bookmark (state file date when sync started)
+ self.assertLessEqual(self.parse_date(bookmark_with_lookback_window), self.parse_date(records[0].get(replication_key)))
+ self.assertLess(self.parse_date(records[0].get(replication_key)), self.parse_date(bookmark))
+
+ def test_run(self):
+ # run with REST API
+ self.salesforce_api = 'REST'
+ self.run_test()
+
+ # run with BULK API
+ self.salesforce_api = 'BULK'
+ self.run_test()
diff --git a/tests/unittests/test_lookback_window.py b/tests/unittests/test_lookback_window.py
new file mode 100644
index 00000000..2dd55850
--- /dev/null
+++ b/tests/unittests/test_lookback_window.py
@@ -0,0 +1,97 @@
+from tap_salesforce.salesforce import Salesforce
+import unittest
+
+start_date = "2022-05-02T00:00:00.000000Z"
+bookmark = "2022-05-23T00:00:00.000000Z"
+
+catalog_entry = {
+ "tap_stream_id": "Test",
+ "metadata": [
+ {
+ "breadcrumb": [],
+ "metadata": {
+ "replication-method": "INCREMENTAL",
+ "replication-key": "SystemModstamp"
+ }
+ }
+ ]
+}
+
+TEST_STATE = {
+ "bookmarks": {
+ "Test": {
+ "SystemModstamp": bookmark
+ }
+ }
+}
+
+TEST_LOOKBACK_WINDOW = 60
+
+
+class SalesforceGetStartDateTests(unittest.TestCase):
+ """Test all of the combinations of having a bookmark or not and a
+ lookback_window or not
+
+ These tests set up a minimal `Salesforce` object and call
+ `Salesforce.get_start_date()` for each of the following scenarios
+
+ | Has Lookback | Has bookmark | Expectation |
+ |--------------+--------------+-------------------|
+ | No | No | start date |
+ | No | Yes | bookmark |
+ | Yes | No | start date |
+ | Yes | Yes | adjusted bookmark |
+ """
+ def test_no_lookback_no_bookmark_returns_start_date(self):
+ sf_obj = Salesforce(
+ default_start_date=start_date
+ )
+
+ expected = start_date
+ actual = sf_obj.get_start_date(
+ {},
+ catalog_entry
+ )
+
+ self.assertEqual(expected, actual)
+
+ def test_no_lookback_yes_bookmark_returns_bookmark(self):
+ sf_obj = Salesforce(
+ default_start_date=start_date
+ )
+
+ expected = bookmark
+ actual = sf_obj.get_start_date(
+ TEST_STATE,
+ catalog_entry
+ )
+
+ self.assertEqual(expected, actual)
+
+ def test_yes_lookback_no_bookmark_returns_start_date(self):
+ sf_obj = Salesforce(
+ default_start_date=start_date,
+ lookback_window=TEST_LOOKBACK_WINDOW,
+ )
+
+ expected = start_date
+ actual = sf_obj.get_start_date(
+ {},
+ catalog_entry
+ )
+
+ self.assertEqual(expected, actual)
+
+ def test_yes_lookback_yes_bookmark_returns_adjusted_bookmark(self):
+ sf_obj = Salesforce(
+ default_start_date=start_date,
+ lookback_window=TEST_LOOKBACK_WINDOW
+ )
+
+ expected = "2022-05-22T23:59:00.000000Z"
+ actual = sf_obj.get_start_date(
+ TEST_STATE,
+ catalog_entry
+ )
+
+ self.assertEqual(expected, actual)