diff --git a/README.md b/README.md
index 39c0f28..1aa0f29 100644
--- a/README.md
+++ b/README.md
@@ -126,12 +126,8 @@ It might be important to understand how this works if there is an unusual situat
 If you wish to completely reprocess and submit a month's data from log files you can:
 
-1. Manually send a DELETE request to the hub for an id to remove a report.
-2. Remove the state data from the json file for a particular year-month.
-3. Remove the appropriate month's sqlite database from the file system
-4. Reprocess the month. If it's after the month, use *year_month* for the months report you'd like.
-
-These 4 steps can be automatically done by passing *clean_for_rerun=True*. Note: If a report id exists in the state file the DELETE request will be called regardless of the *upload_to_hub* flag
+1. Set *clean_for_rerun=True* in the config file, or pass it as an environment variable when calling main.py.
+2. Reprocess the month. If the month is already over, set *year_month* to the month whose report you want.
 
 ```CLEAN_FOR_RERUN=True ./main.py```
diff --git a/config/config.py b/config/config.py
index c71666e..65f5645 100644
--- a/config/config.py
+++ b/config/config.py
@@ -247,11 +247,10 @@ def filenames_to_process(self):
     def delete_log_processed_date(self):
         # clean up data for this period, so it can be re-run
         if self.year_month in self.state_dict:
-            if 'id' in self.state_dict[self.year_month]:
-                upload.delete_from_datacite(self.state_dict[self.year_month]['id'])
             self.log.info(f"Removing state: {self.year_month}")
             # remove the info from the state json
-            self.state_dict.pop(self.year_month)
+            if 'last_processed_day' in self.state_dict[self.year_month]:
+                self.state_dict[self.year_month].pop('last_processed_day')
             # delete the specific database for this time period
             my_file = f'state/counter_db_{self.year_month}.sqlite3'
             if os.path.exists(my_file):
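For context on the `delete_log_processed_date` change above: a minimal sketch of the new cleanup behavior (not the repo's code), assuming a state dict shaped like the keys the method touches (`id`, `last_processed_day`). The report id now survives a rerun, presumably so the month can be resubmitted to the hub under the same id rather than deleted and recreated.

```
import os

# hypothetical state, shaped after the keys config.py reads and writes
state_dict = {'2019-01': {'id': 'some-report-id', 'last_processed_day': 11}}
year_month = '2019-01'

# the rewritten cleanup keeps the report id and only forgets processing progress
if year_month in state_dict:
    if 'last_processed_day' in state_dict[year_month]:
        state_dict[year_month].pop('last_processed_day')

# the month's sqlite database is removed so counts are rebuilt from the logs
my_file = f'state/counter_db_{year_month}.sqlite3'
if os.path.exists(my_file):
    os.remove(my_file)

print(state_dict)  # {'2019-01': {'id': 'some-report-id'}}
```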
diff --git a/config/counter-processor-config.yaml b/config/counter-processor-config.yaml
new file mode 100644
index 0000000..2614454
--- /dev/null
+++ b/config/counter-processor-config.yaml
@@ -0,0 +1,68 @@
+# currently there is no option other than daily logs, with a year-month-day date in the name
+# (4-digit year, 2-digit month and day), e.g.
+# /usr/local/payara6/glassfish/domains/domain1/logs/counter_2019-01-11.log
+#log_name_pattern: sample_logs/counter_(yyyy-mm-dd).log
+log_name_pattern: /usr/local/payara6/glassfish/domains/domain1/logs/mdc/counter_(yyyy-mm-dd).log
+
+# the path_types regular expressions classify page urls as either an investigation or a request,
+# based on the specific URL structure of your system.
+# Dataverse note: the url being matched does not include the query params, so dataset.xhtml\S+ will not match
+path_types:
+  investigations:
+    - ^.*/dataset.xhtml\S*$
+    - ^.*/file.xhtml\S*$
+    - ^.*/api/datasets\S*$
+    - ^.*/api/v1/datasets\S*$
+    ## Below historic regex for testing
+    #- ^/api/datasets/[^\/]+$
+    #- ^/api/versions/\d+$
+    #- ^/stash/dataset/\S+$
+    #- ^/stash/data_paper/\S+$
+  requests:
+    - ^.*/api/access/datafile\S+$
+    - ^.*/api/v1/access/datafile\S+$
+    ## Below historic regex for testing
+    #- ^/api/datasets/[^\/]+/download$
+    #- ^/api/versions/\d+/download$
+    #- ^/api/downloads/\d+$
+    #- ^/stash/downloads/download_resource/\d+$
+    #- ^/stash/downloads/file_download/\d+$
+    #- ^/stash/downloads/file_stream/\d+$
+    #- ^/stash/downloads/async_request/\d+$
+    #- ^/stash/share/\S+$
+
+# the robots and machines urls point to downloadable lists of regular expressions used to decide
+# whether a user-agent is a robot or a machine. Each text file has one regular expression per line
+robots_url: https://raw.githubusercontent.com/CDLUC3/Make-Data-Count/master/user-agents/lists/robot.txt
+machines_url: https://raw.githubusercontent.com/CDLUC3/Make-Data-Count/master/user-agents/lists/machine.txt
+
+# the year and month for the report you are creating
+year_month: 2019-01
+
+# Don't include a filename extension; the code adds the tsv or json extension for you.
+# Output formats are currently tsv or json. TSV is broken until someone accepts reports in that format.
+# FIXME: "/tmp" is fine for a quick test but pick another directory that
+# the "counter" user can write to and that the "dataverse" user can read from.
+output_file: /tmp/make-data-count-report
+output_format: json
+
+# the name of the platform that goes into your reports
+# FIXME: Change "platform" to match the name of your Dataverse installation. Examples
+# are "Harvard Dataverse" for Harvard University or "LibraData" for the University of Virginia.
+platform: LibreScholar
+
+# Don't put your api token here if you're going to commit this file; put it in a separate secrets.yaml
+# in the same directory as the config, or set an environment variable at startup to override the key.
+# yaml key/values set in secrets.yaml will override the ones in the main config.
+hub_api_token: set_me_in_secrets
+# the test metrics hub is only for testing
+# hub_base_url: https://metrics.test.datacite.org
+# FIXME: change "hub_base_url" to https://api.datacite.org (production) once you have credentials.
+hub_base_url: https://api.test.datacite.org
+# FIXME: Change "upload_to_hub" to True when you're ready.
+upload_to_hub: False
+
+# only use this to simulate running on a date other than today
+# simulate_date: 2019-01-12
+
+maxmind_geoip_country_path: maxmind_geoip/GeoLite2-Country.mmdb
diff --git a/config/prod_secrets.yaml b/config/prod_secrets.yaml
new file mode 100644
index 0000000..6ef865b
--- /dev/null
+++ b/config/prod_secrets.yaml
@@ -0,0 +1,2 @@
+# production account token
+hub_api_token: REDACTED
diff --git a/maxmind_geoip/GeoLite2-Country_20191217/COPYRIGHT.txt b/maxmind_geoip/GeoLite2-Country_20191217/COPYRIGHT.txt
new file mode 100644
index 0000000..5f38895
--- /dev/null
+++ b/maxmind_geoip/GeoLite2-Country_20191217/COPYRIGHT.txt
@@ -0,0 +1 @@
+Database and Contents Copyright (c) 2019 MaxMind, Inc.
diff --git a/maxmind_geoip/GeoLite2-Country_20191217/GeoLite2-Country.mmdb b/maxmind_geoip/GeoLite2-Country_20191217/GeoLite2-Country.mmdb
new file mode 100644
index 0000000..22b5ae3
Binary files /dev/null and b/maxmind_geoip/GeoLite2-Country_20191217/GeoLite2-Country.mmdb differ
diff --git a/maxmind_geoip/GeoLite2-Country_20191217/LICENSE.txt b/maxmind_geoip/GeoLite2-Country_20191217/LICENSE.txt
new file mode 100644
index 0000000..8216b8f
--- /dev/null
+++ b/maxmind_geoip/GeoLite2-Country_20191217/LICENSE.txt
@@ -0,0 +1,3 @@
+This work is licensed under the Creative Commons Attribution-ShareAlike 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/4.0/.
+
+This database incorporates GeoNames [http://www.geonames.org] geographical data, which is made available under the Creative Commons Attribution 3.0 License. To view a copy of this license, visit http://www.creativecommons.org/licenses/by/3.0/us/.
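The layering described in the config comments — main YAML, then a secrets.yaml in the same directory, then environment variables — could be loaded roughly like this (a hypothetical sketch, not the counter-processor's actual loader; the paths are illustrative):

```
import os
import yaml  # PyYAML

def load_config(main_path='config/counter-processor-config.yaml',
                secrets_path='config/secrets.yaml'):
    with open(main_path) as f:
        cfg = yaml.safe_load(f)
    # key/values in secrets.yaml override the main config
    if os.path.exists(secrets_path):
        with open(secrets_path) as f:
            cfg.update(yaml.safe_load(f) or {})
    # environment variables override both, e.g. CLEAN_FOR_RERUN=True ./main.py
    for key in list(cfg):
        if key.upper() in os.environ:
            cfg[key] = os.environ[key.upper()]
    return cfg
```

This layering only protects the token if the secrets file itself stays out of version control, as the config comments warn.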
diff --git a/upload/upload.py b/upload/upload.py
index 238280f..3535233 100644
--- a/upload/upload.py
+++ b/upload/upload.py
@@ -25,6 +25,7 @@ def save_response(response):
 
 # this method is a retry since the datacite api randomly gives many 500 errors
 def retry_if_500(method, url, data, headers):
+    print(f'Uploading {method} {url} ...')
     for attempt in range(60):
         response = getattr(requests, method)(url, data=gzip.compress(data.encode()), headers=headers)
         if response.status_code < 500 or attempt == 59:
@@ -85,14 +86,3 @@ def send_to_datacite():
         sys.exit(1)
     else:
         print(f'Submitted ID: {my_id}')
-
-def delete_from_datacite(id):
-    headers = {
-        'Authorization': f'Bearer {config.Config().hub_api_token}'
-    }
-    my_url = urljoin(config.Config().hub_base_url, f'reports/{pathname2url(id)}')
-    response = retry_if_500(method='delete', url=my_url, data='', headers=headers)
-    if response.status_code < 200 or response.status_code > 299:
-        print(f'Delete ID: {id}. Expected to get 204, but got code {response.status_code}')
-    else:
-        print(f'Deleted ID: {id}')
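With delete_from_datacite removed, deleting a previously submitted report from the hub becomes a manual step again. A minimal sketch based on the deleted code (same endpoint and auth scheme; the base URL, token, and report id are placeholders):

```
import requests
from urllib.parse import urljoin
from urllib.request import pathname2url

hub_base_url = 'https://api.test.datacite.org'  # placeholder
hub_api_token = 'set_me_in_secrets'             # placeholder
report_id = 'your-report-id'                    # placeholder

my_url = urljoin(hub_base_url, f'reports/{pathname2url(report_id)}')
response = requests.delete(my_url, headers={'Authorization': f'Bearer {hub_api_token}'})
# the hub returns 204 No Content on a successful delete
print(response.status_code)
```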