diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 5b89bd3..608813f 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -45,7 +45,7 @@ plugins: syrupy-4.6.1, cov-4.1.0
 collected 14 items

 tests/test_file_helper.py .                                        [  7%]
-tests/test_json_generator.py ss.                                   [ 28%]
+tests/test_meta_generator.py ...                                   [ 28%]
 tests/test_json_support.py ...                                     [ 50%]
 tests/test_metadata.py ...                                         [ 71%]
 tests/test_misc.py ..                                              [ 85%]
diff --git a/README.md b/README.md
index fad9aec..eb3bb41 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ You can run `python3 --version` to check the version of Python installed.
 As a general practice, it is recommended to use a virtual environment for the installation.

 ```shell
-python3.9 -m venv virtenv
+python3.11 -m venv virtenv
 source virtenv/bin/activate
 ```
@@ -52,7 +52,7 @@ The package includes the following CLI programs:

 | Program | Description |
 | ------- |------------------------------------------------|
-| [`pbp-json-gen`](https://docs.mbari.org/pbp/pbp-json-gen/) | Generate JSON files with audio metadata. |
+| [`pbp-meta-gen`](https://docs.mbari.org/pbp/pbp-meta-gen/) | Generate JSON files with audio metadata. |
 | [`pbp`](https://docs.mbari.org/pbp/pbp/) | Main HMB generation program. |
 | [`pbp-cloud`](https://docs.mbari.org/pbp/pbp-cloud/) | Program for cloud based processing. |
 | [`pbp-plot`](https://docs.mbari.org/pbp/pbp-plot/) | Utility program to plot resulting HMB product. |
--sensitivity-uri="$WS/NRS11_H5R6_sensitivity_hms5kHz.nc" \ --subset-to 10 2000 \ - --output-prefix=NRS11_ \ + --output-prefixes=NRS11_ \ --output-dir="$WS/OUTPUT" \ --download-dir="$WS/DOWNLOADS" \ --retain-downloaded-files \ @@ -149,7 +149,7 @@ main-nrs11-multiple-days year month *days="": --voltage-multiplier=2.5 \ --sensitivity-uri="$WS/NRS11_H5R6_sensitivity_hms5kHz.nc" \ --subset-to 10 2000 \ - --output-prefix=NRS11_ \ + --output-prefixes=NRS11_ \ --output-dir="$WS/OUTPUT" \ --download-dir="$WS/DOWNLOADS" \ --retain-downloaded-files \ diff --git a/pbp/file_helper.py b/pbp/file_helper.py index 75ba205..9a26e6f 100644 --- a/pbp/file_helper.py +++ b/pbp/file_helper.py @@ -207,7 +207,7 @@ def __init__( Prefix mapping to get actual audio uri to be used. Example: `s3://pacific-sound-256khz-2022~file:///PAM_Archive/2022` :param audio_path_prefix: - Ad hoc path prefix for sound file locations, e.g. "/Volumes" + Ad hoc path prefixes for sound file locations, e.g. "/Volumes" :param segment_size_in_mins: The size of each audio segment to extract, in minutes. By default, 1. :param s3_client: diff --git a/pbp/json_generator/gen_iclisten.py b/pbp/json_generator/gen_iclisten.py deleted file mode 100644 index 7eba49d..0000000 --- a/pbp/json_generator/gen_iclisten.py +++ /dev/null @@ -1,222 +0,0 @@ -# pbp, Apache License 2.0 -# Filename: metadata/generator/gen_iclisten.py -# Description: Captures ICListen wav metadata in a pandas dataframe from either a local directory or S3 bucket. - -import re -from datetime import timedelta -from datetime import datetime -from typing import List - -import boto3 - -import pandas as pd -from pathlib import Path -from progressbar import progressbar -import pbp.json_generator.utils as utils -from pbp.json_generator.corrector import MetadataCorrector -from pbp.json_generator.metadata_extractor import GenericWavFile -from pbp.json_generator.gen_abstract import MetadataGeneratorAbstract - - -class IcListenMetadataGenerator(MetadataGeneratorAbstract): - log_prefix = None - - def __init__( - self, - log, # : loguru.Logger, - uri: str, - json_base_dir: str, - start: datetime, - end: datetime, - prefix: List[str], - seconds_per_file: float = 600.0, - ): - """ - Captures ICListen wav metadata in a pandas dataframe from either a local directory or S3 bucket. - :param uri: - The local directory or S3 bucket that contains the wav files - :param json_base_dir: - The local directory to store the metadata - :param start: - The start date to search for wav files - :param end: - The end date to search for wav files - :param prefix: - The search pattern to match the wav files, e.g. 'MARS' for MARS_YYYYMMDD_HHMMSS.wav - :param seconds_per_file: - The number of seconds per file expected in a wav file to check for missing data. If 0, then no check is done. - :return: - """ - super().__init__(log, uri, json_base_dir, prefix, start, end, seconds_per_file) - self.log_prefix = f"{self.__class__.__name__} {start:%Y%m%d}" - - def run(self): - self.log.info( - f"{self.log_prefix} Generating metadata for {self.start} to {self.end}..." 
-        )
-
-        bucket_name, prefix, scheme = utils.parse_s3_or_gcp_url(self.audio_loc)
-
-        # gs is not supported for icListen
-        if scheme == "gs":
-            self.log.error(
-                f"{self.log_prefix} GS is not supported for icListen audio files"
-            )
-            return
-
-        # Run for each day in the range
-        for day in pd.date_range(self.start, self.end, freq="D"):
-            try:
-                self.df = None
-                self.log.info(
-                    f"{self.log_prefix} Searching in {self.audio_loc}/*.wav for wav files that match the search pattern {self.prefix}* ..."
-                )
-
-                wav_files = []
-
-                def check_file(f: str, f_start_dt: datetime, f_end_dt: datetime):
-                    """
-                    Check if the file matches the search pattern and is within the start and end dates
-                    :param f:
-                        The path to the file
-                    :param f_start_dt:
-                        The start date to check
-                    :param f_end_dt:
-                        The end date to check
-                    :return:
-                    """
-
-                    f_path = Path(f)
-                    f_wav_dt = None
-
-                    for s in self.prefix:
-                        # see if the file is a regexp match to search
-                        rc = re.search(s, f_path.stem)
-
-                        if rc and rc.group(0):
-                            try:
-                                # MARS file date is in the filename MARS_YYYYMMDD_HHMMSS.wav
-                                f_path_dt = datetime.strptime(
-                                    f_path.stem, f"{s}%Y%m%d_%H%M%S"
-                                )
-
-                                if f_start_dt <= f_path_dt <= f_end_dt:
-                                    self.log.info(
-                                        f"{self.log_prefix} Found {f_path.name} to process"
-                                    )
-                                    wav_files.append(
-                                        GenericWavFile(self.log, f, f_path_dt)
-                                    )
-                                    f_wav_dt = f_path_dt
-                            except ValueError:
-                                self.log.error(
-                                    f"{self.log_prefix} Could not parse {f_path.name}"
-                                )
-                                return None
-
-                    return f_wav_dt
-
-                # Set the start and end dates to 1 hour before and after the start and end dates
-                start_dt = day - timedelta(hours=1)
-                end_dt = day + timedelta(days=1)
-
-                if scheme == "file":
-                    wav_path = Path(self.audio_loc)
-                    for filename in progressbar(
-                        sorted(wav_path.rglob("*.wav")), prefix="Searching : "
-                    ):
-                        check_file(filename.as_posix(), start_dt, end_dt)
-                if scheme == "s3":
-                    client = boto3.client("s3")
-                    for day_hour in pd.date_range(start=start_dt, end=end_dt, freq="h"):
-                        bucket = f"{bucket_name}-{day_hour.year:04d}"
-                        prefix = f"{day_hour.month:02d}/MARS_{day_hour.year:04d}{day_hour.month:02d}{day_hour.day:02d}_{day_hour.hour:02d}"
-                        paginator = client.get_paginator("list_objects")
-
-                        operation_parameters = {"Bucket": bucket, "Prefix": prefix}
-                        page_iterator = paginator.paginate(**operation_parameters)
-                        self.log.info(
-                            f"{self.log_prefix} Searching in bucket: {bucket} prefix: {prefix}"
-                        )
-                        # list the objects in the bucket
-                        # loop through the objects and check if they match the search pattern
-                        for page in page_iterator:
-                            if "Contents" not in page:
-                                self.log.info(
-                                    f"{self.log_prefix} No data found in {bucket}"
-                                )
-                                break
-
-                            for obj in page["Contents"]:
-                                key = obj["Key"]
-                                wav_dt = check_file(
-                                    f"s3://{bucket}/{key}", start_dt, end_dt
-                                )
-                                if wav_dt is None:
-                                    continue
-                                if wav_dt > end_dt or wav_dt < start_dt:
-                                    break
-
-                self.log.info(
-                    f"{self.log_prefix} Found {len(wav_files)} files to process that cover the period {start_dt} - {end_dt}"
-                )
-
-                # sort the files by start time
-                wav_files.sort(key=lambda x: x.start)
-
-                # create a dataframe from the wav files
-                self.log.info(
-                    f"{self.log_prefix} Creating dataframe from {len(wav_files)} files spanning {wav_files[0].start} to {wav_files[-1].start}..."
-                )
-                for wc in wav_files:
-                    df_wav = wc.to_df()
-
-                    # concatenate the metadata to the dataframe
-                    self.df = pd.concat([self.df, df_wav], axis=0)
-
-                self.log.debug(f"{self.log_prefix} Running metadata corrector for {day}")
-                corrector = MetadataCorrector(
-                    self.log,
-                    self.df,
-                    self.json_base_dir,
-                    day,
-                    utils.InstrumentType.NRS,
-                    True,
-                    self.seconds_per_file,
-                )
-                corrector.run()
-
-            except Exception as ex:
-                self.log.exception(str(ex))
-
-
-if __name__ == "__main__":
-    from pbp.logging_helper import create_logger
-
-    log_dir = Path("tests/log")
-    json_dir = Path("tests/json/mars")
-    log_dir.mkdir(exist_ok=True, parents=True)
-    json_dir.mkdir(exist_ok=True, parents=True)
-
-    log = create_logger(
-        log_filename_and_level=(
-            f"{log_dir}/test_iclisten_metadata_generator.log",
-            "INFO",
-        ),
-        console_level="INFO",
-    )
-
-    start = datetime(2023, 7, 18, 0, 0, 0)
-    end = datetime(2023, 7, 18, 0, 0, 0)
-
-    # If only running one day, use a single generator
-    generator = IcListenMetadataGenerator(
-        log,
-        uri="s3://pacific-sound-256khz",
-        json_base_dir=json_dir.as_posix(),
-        prefix=["MARS"],
-        start=start,
-        end=end,
-        seconds_per_file=600,
-    )
-    generator.run()
diff --git a/pbp/json_generator/utils.py b/pbp/json_generator/utils.py
deleted file mode 100644
index 2cc63f4..0000000
--- a/pbp/json_generator/utils.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# pbp, Apache License 2.0
-# Filename: json_generator/utils.py
-# Description: Utility functions for parsing S3, GS or local file urls and defining sound instrument types for metadata generation
-from typing import Tuple
-from urllib.parse import urlparse
-
-
-class InstrumentType:
-    NRS = "NRS"
-    ICLISTEN = "ICLISTEN"
-    SOUNDTRAP = "SOUNDTRAP"
-
-
-def parse_s3_or_gcp_url(url) -> Tuple[str, str, str]:
-    """
-    Parse the S3, GS of local file url
-    :param url:
-    :return:
-    """
-    parsed_url = urlparse(url)
-    bucket = parsed_url.netloc
-    prefix = parsed_url.path.lstrip("/")
-    if parsed_url.scheme == "file":
-        bucket = ""
-        prefix = parsed_url.path
-    return bucket, prefix, parsed_url.scheme
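`parse_s3_or_gcp_url` is re-added essentially unchanged (with docstring fixes) in `pbp/meta_gen/utils.py` below. A quick sketch of its behavior, which follows directly from the `urllib.parse.urlparse` calls shown above:

```python
from pbp.meta_gen.utils import parse_s3_or_gcp_url

# Returns (bucket, prefix, scheme); the file scheme keeps the full path as the prefix.
assert parse_s3_or_gcp_url("s3://pacific-sound-256khz/2022") == ("pacific-sound-256khz", "2022", "s3")
assert parse_s3_or_gcp_url("gs://noaa-passive-bioacoustic/nrs/audio") == ("noaa-passive-bioacoustic", "nrs/audio", "gs")
assert parse_s3_or_gcp_url("file:///PAM_Archive/2022") == ("", "/PAM_Archive/2022", "file")
```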
diff --git a/pbp/main_json_generator.py b/pbp/main_meta_generator.py
similarity index 84%
rename from pbp/main_json_generator.py
rename to pbp/main_meta_generator.py
index e4b7fb4..f877d38 100644
--- a/pbp/main_json_generator.py
+++ b/pbp/main_meta_generator.py
@@ -1,10 +1,10 @@
 from datetime import datetime
 from pathlib import Path

-from pbp.json_generator.gen_nrs import NRSMetadataGenerator
-from pbp.json_generator.gen_iclisten import IcListenMetadataGenerator
-from pbp.json_generator.gen_soundtrap import SoundTrapMetadataGenerator
-from pbp.main_json_generator_args import parse_arguments
+from pbp.meta_gen.gen_nrs import NRSMetadataGenerator
+from pbp.meta_gen.gen_iclisten import IcListenMetadataGenerator
+from pbp.meta_gen.gen_soundtrap import SoundTrapMetadataGenerator
+from pbp.main_meta_generator_args import parse_arguments

 # Some imports, in particular involving data processing, cause a delay that is
 # noticeable when just running the --help option. We get around this issue by
@@ -38,7 +38,7 @@ def main():
             log=log,
             uri=opts.uri,
             json_base_dir=json_dir.as_posix(),
-            prefix=opts.prefix,
+            prefixes=opts.prefixes,
             start=start,
             end=end,
         )
@@ -48,7 +48,7 @@ def main():
             log=log,
             uri=opts.uri,
             json_base_dir=json_dir.as_posix(),
-            prefix=opts.prefix,
+            prefixes=opts.prefixes,
             start=start,
             end=end,
         )
@@ -59,7 +59,7 @@ def main():
             log=log,
             uri=opts.uri,
             json_base_dir=json_dir.as_posix(),
-            prefix=opts.prefix,
+            prefixes=opts.prefixes,
             start=start,
             end=end,
         )
diff --git a/pbp/main_json_generator_args.py b/pbp/main_meta_generator_args.py
similarity index 88%
rename from pbp/main_json_generator_args.py
rename to pbp/main_meta_generator_args.py
index 2f4ec70..3f7bab3 100644
--- a/pbp/main_json_generator_args.py
+++ b/pbp/main_meta_generator_args.py
@@ -1,7 +1,7 @@
 from argparse import ArgumentParser, RawTextHelpFormatter

 from pbp import get_pbp_version
-from pbp.json_generator.utils import InstrumentType
+from pbp.meta_gen.utils import InstrumentType


 def parse_arguments():
@@ -11,13 +11,13 @@ def parse_arguments():
     )
     example = """
Examples:
-    pbp-json-gen \\
+    pbp-meta-gen \\
                 --json-base-dir=tests/json/nrs \\
                 --output-dir=output \\
                 --uri=s3://pacific-sound-ch01 \\
                 --start=20220902 \\
                 --end=20220902 \\
-                --prefix=MARS \\
+                --prefixes=MARS \\
                 --recorder=NRS
"""

@@ -80,12 +80,12 @@ def parse_arguments():
     )

     parser.add_argument(
-        "--prefix",
+        "--prefixes",
         type=str,
         required=True,
         nargs="+",
-        help="Prefix for search to match the audio files. Assumption is the prefix is separated by an "
-        "underscore, e.g. 'MARS_'.",
+        help="Prefixes to match the audio files, e.g. 'MARS_' for MARS_YYYYMMDD_HHMMSS.wav,"
+        " or '7000.' for 7000.20220902.000000.wav.",
     )

     return parser.parse_args()
diff --git a/pbp/json_generator/__init__.py b/pbp/meta_gen/__init__.py
similarity index 100%
rename from pbp/json_generator/__init__.py
rename to pbp/meta_gen/__init__.py
diff --git a/pbp/json_generator/gen_abstract.py b/pbp/meta_gen/gen_abstract.py
similarity index 91%
rename from pbp/json_generator/gen_abstract.py
rename to pbp/meta_gen/gen_abstract.py
index 5bf3b0f..a15df18 100644
--- a/pbp/json_generator/gen_abstract.py
+++ b/pbp/meta_gen/gen_abstract.py
@@ -13,7 +13,7 @@ def __init__(
         log,  # : loguru.Logger,
         audio_loc: str,
         json_base_dir: str,
-        prefix: List[str],
+        prefixes: List[str],
         start: datetime,
         end: datetime,
         seconds_per_file: float = 0.0,
@@ -25,8 +25,8 @@ def __init__(
             The local directory or cloud bucket that contains the wav files
         :param json_base_dir:
             The local directory to write the json files to
-        :param prefix:
-            The search pattern to match the wav files, e.g. 'MARS'
+        :param prefixes:
+            The search patterns to match the wav files, e.g. 'MARS'
         :param start:
             The start date to search for wav files
         :param end:
@@ -41,7 +41,7 @@ def __init__(
             self.df = pd.DataFrame()
             self.start = start
             self.end = end
-            self.prefix = prefix
+            self.prefixes = prefixes
             self._log = log
             self._seconds_per_file = None if seconds_per_file == 0 else seconds_per_file
         except Exception as e:
diff --git a/pbp/meta_gen/gen_iclisten.py b/pbp/meta_gen/gen_iclisten.py
new file mode 100644
index 0000000..4da8366
--- /dev/null
+++ b/pbp/meta_gen/gen_iclisten.py
@@ -0,0 +1,179 @@
+# pbp, Apache License 2.0
+# Filename: meta_gen/gen_iclisten.py
+# Description: Captures ICListen wav metadata in a pandas dataframe from either a local directory or S3 bucket.
+
+from datetime import timedelta
+from datetime import datetime
+from typing import List
+
+import boto3
+
+import pandas as pd
+from pathlib import Path
+from progressbar import progressbar
+import pbp.meta_gen.utils as utils
+from pbp.meta_gen.json_generator import JsonGenerator
+from pbp.meta_gen.meta_reader import GenericWavFile
+from pbp.meta_gen.gen_abstract import MetadataGeneratorAbstract
+
+
+class IcListenMetadataGenerator(MetadataGeneratorAbstract):
+    log_prefix = None
+
+    def __init__(
+        self,
+        log,  # : loguru.Logger,
+        uri: str,
+        json_base_dir: str,
+        start: datetime,
+        end: datetime,
+        prefixes: List[str],
+        seconds_per_file: float = 600.0,
+    ):
+        """
+        Captures ICListen wav metadata in a pandas dataframe from either a local directory or S3 bucket.
+        :param uri:
+            The local directory or S3 bucket that contains the wav files
+        :param json_base_dir:
+            The local directory to store the metadata
+        :param start:
+            The start date to search for wav files
+        :param end:
+            The end date to search for wav files
+        :param prefixes:
+            The search patterns to match the wav files, e.g. 'MARS_' for MARS_YYYYMMDD_HHMMSS.wav
+        :param seconds_per_file:
+            The number of seconds per file expected in a wav file to check for missing data. If 0, then no check is done.
+        :return:
+        """
+        super().__init__(log, uri, json_base_dir, prefixes, start, end, seconds_per_file)
+        self.log_prefix = f"{self.__class__.__name__} {start:%Y%m%d}"
+    def run(self):
+        self.log.info(
+            f"{self.log_prefix} Generating metadata for {self.start} to {self.end}..."
+        )
+
+        bucket_name, prefix, scheme = utils.parse_s3_or_gcp_url(self.audio_loc)
+
+        # gs is not supported for icListen
+        if scheme == "gs":
+            self.log.error(f"{self.log_prefix} GS is not supported for icListen audio files")
+            return
+
+        # Run for each day in the range
+        for day in pd.date_range(self.start, self.end, freq="D"):
+            try:
+                self.df = None
+                self.log.info(f"{self.log_prefix} Searching in {self.audio_loc}/*.wav "
+                              f"for wav files that match the search patterns {self.prefixes} ...")
+
+                wav_files = []
+
+                # Set the start and end dates to 1 hour before and after the start and end dates
+                start_dt = day - timedelta(hours=1)
+                end_dt = day + timedelta(days=1)
+
+                if scheme == "file":
+                    wav_path = Path(self.audio_loc.split("file://")[-1])
+                    for filename in progressbar(sorted(wav_path.rglob("*.wav")), prefix="Searching : "):
+                        wav_dt = utils.get_datetime(filename, self.prefixes)
+                        if wav_dt and start_dt <= wav_dt <= end_dt:
+                            self.log.info(f"Found file {filename} with timestamp {wav_dt}")
+                            wav_files.append(GenericWavFile(self.log, filename, wav_dt))
+
+                if scheme == "s3":
+                    client = boto3.client("s3")
+                    for day_hour in pd.date_range(start=start_dt, end=end_dt, freq="h"):
+                        bucket = f"{bucket_name}-{day_hour.year:04d}"
+
+                        for p in self.prefixes:
+                            prefix = f"{day_hour.month:02d}/{p}{day_hour.year:04d}{day_hour.month:02d}{day_hour.day:02d}_{day_hour.hour:02d}"
+                            paginator = client.get_paginator("list_objects")
+
+                            operation_parameters = {"Bucket": bucket, "Prefix": prefix}
+                            page_iterator = paginator.paginate(**operation_parameters)
+                            self.log.info(f"{self.log_prefix} Searching in bucket: {bucket} prefix: {prefix}")
+
+                            # list the objects in the bucket
+                            # loop through the objects and check if they match the search pattern
+                            for page in page_iterator:
+                                if "Contents" not in page:
+                                    self.log.info(f"{self.log_prefix} No data found in {bucket}")
+                                    break
+
+                                for obj in page["Contents"]:
+                                    key = obj["Key"]
+                                    wav_dt = utils.get_datetime(f"s3://{bucket}/{key}", self.prefixes)
+                                    if wav_dt is None:
+                                        continue
+                                    if start_dt <= wav_dt <= end_dt:
+                                        self.log.info(f'Found {f"s3://{bucket}/{key}"} with timestamp {wav_dt}')
+                                        wav_files.append(GenericWavFile(self.log, f"s3://{bucket}/{key}", wav_dt))
+
+                self.log.info(f"{self.log_prefix} Found {len(wav_files)} files to process that "
+                              f"cover the period {start_dt} - {end_dt}")
+
+                if len(wav_files) == 0:
+                    self.log.info(f"{self.log_prefix} No files found to process that "
+                                  f"cover the period {start_dt} - {end_dt}")
+                    continue
+
+                # sort the files by start time
+                wav_files.sort(key=lambda x: x.start)
+
+                # create a dataframe from the wav files
+                self.log.info(f"{self.log_prefix} Creating dataframe from {len(wav_files)} files "
+                              f"spanning {wav_files[0].start} to {wav_files[-1].start}...")
+
+                for wc in wav_files:
+                    df_wav = wc.to_df()
+
+                    # concatenate the metadata to the dataframe
+                    self.df = pd.concat([self.df, df_wav], axis=0)
+
+                self.log.debug(f"{self.log_prefix} Running metadata json_gen for {day}")
+                json_gen = JsonGenerator(
+                    self.log,
+                    self.df,
+                    self.json_base_dir,
+                    day,
+                    utils.InstrumentType.ICLISTEN,
+                    True,
+                    self.seconds_per_file,
+                )
+                json_gen.run()
+
+            except Exception as ex:
+                self.log.exception(str(ex))
f"{log_dir}/test_iclisten_metadata_generator.log", + "INFO", + ), + console_level="INFO", + ) + + start = datetime(2023, 7, 18, 0, 0, 0) + end = datetime(2023, 7, 18, 0, 0, 0) + + # If only running one day, use a single generator + generator = IcListenMetadataGenerator( + log, + uri="s3://pacific-sound-256khz", + json_base_dir=json_dir.as_posix(), + prefixes=["MARS_"], + start=start, + end=end, + seconds_per_file=600, + ) + generator.run() diff --git a/pbp/json_generator/gen_nrs.py b/pbp/meta_gen/gen_nrs.py similarity index 57% rename from pbp/json_generator/gen_nrs.py rename to pbp/meta_gen/gen_nrs.py index 04621e3..3002363 100644 --- a/pbp/json_generator/gen_nrs.py +++ b/pbp/meta_gen/gen_nrs.py @@ -12,10 +12,12 @@ import pandas as pd from pathlib import Path from progressbar import progressbar -from pbp.json_generator.corrector import MetadataCorrector -from pbp.json_generator.metadata_extractor import FlacFile, GenericWavFile as WavFile -from pbp.json_generator.gen_abstract import MetadataGeneratorAbstract -from pbp.json_generator.utils import parse_s3_or_gcp_url, InstrumentType + +from pbp.meta_gen import utils +from pbp.meta_gen.json_generator import JsonGenerator +from pbp.meta_gen.meta_reader import FlacFile, GenericWavFile as WavFile +from pbp.meta_gen.gen_abstract import MetadataGeneratorAbstract +from pbp.meta_gen.utils import parse_s3_or_gcp_url, InstrumentType class NRSMetadataGenerator(MetadataGeneratorAbstract): @@ -26,7 +28,7 @@ def __init__( json_base_dir: str, start: datetime, end: datetime, - prefix: List[str], + prefixes: List[str], seconds_per_file: float = 14400.0, ): """ @@ -39,13 +41,13 @@ def __init__( The start date to search for flac files :param end: The end date to search for flac files - :param prefix: - The search pattern to match the flac files, e.g. 'MARS' for MARS_YYYYMMDD_HHMMSS.flac + :param prefixes: + The search pattern to match the flac files, e.g. 'MARS_' for MARS_YYYYMMDD_HHMMSS.flac :param seconds_per_file: The number of seconds per file expected in a flac/wav file to check for missing data. If 0, then no check is done. 
diff --git a/pbp/json_generator/gen_nrs.py b/pbp/meta_gen/gen_nrs.py
similarity index 57%
rename from pbp/json_generator/gen_nrs.py
rename to pbp/meta_gen/gen_nrs.py
index 04621e3..3002363 100644
--- a/pbp/json_generator/gen_nrs.py
+++ b/pbp/meta_gen/gen_nrs.py
@@ -12,10 +12,12 @@
 import pandas as pd
 from pathlib import Path
 from progressbar import progressbar
-from pbp.json_generator.corrector import MetadataCorrector
-from pbp.json_generator.metadata_extractor import FlacFile, GenericWavFile as WavFile
-from pbp.json_generator.gen_abstract import MetadataGeneratorAbstract
-from pbp.json_generator.utils import parse_s3_or_gcp_url, InstrumentType
+
+from pbp.meta_gen import utils
+from pbp.meta_gen.json_generator import JsonGenerator
+from pbp.meta_gen.meta_reader import FlacFile, GenericWavFile as WavFile
+from pbp.meta_gen.gen_abstract import MetadataGeneratorAbstract
+from pbp.meta_gen.utils import parse_s3_or_gcp_url, InstrumentType


 class NRSMetadataGenerator(MetadataGeneratorAbstract):
@@ -26,7 +28,7 @@ def __init__(
         json_base_dir: str,
         start: datetime,
         end: datetime,
-        prefix: List[str],
+        prefixes: List[str],
         seconds_per_file: float = 14400.0,
     ):
         """
@@ -39,13 +41,13 @@ def __init__(
             The start date to search for flac files
         :param end:
             The end date to search for flac files
-        :param prefix:
-            The search pattern to match the flac files, e.g. 'MARS' for MARS_YYYYMMDD_HHMMSS.flac
+        :param prefixes:
+            The search patterns to match the flac files, e.g. 'NRS11_' for NRS11_YYYYMMDD_HHMMSS.flac
         :param seconds_per_file:
            The number of seconds per file expected in a flac/wav file to check for missing data. If 0, then no check is done.
         :return:
         """
-        super().__init__(log, uri, json_base_dir, prefix, start, end, seconds_per_file)
+        super().__init__(log, uri, json_base_dir, prefixes, start, end, seconds_per_file)

     def run(self):
         self.log.info(f"Generating metadata for {self.start} to {self.end}...")
@@ -57,47 +59,9 @@ def run(self):
             self.log.error("S3 is not supported for NRS audio files")
             return

-        def parse_filename(f: str) -> datetime or None:
-            """
-            Check if the file matches the search pattern and is within the start and end dates
-            :param f:
-                The path to the file
-            :return: The beginning recording time of the file
-            """
-            f_path = Path(f)
-            f_path_dt = None
-
-            for s in self.prefix:
-                # see if the file is a regexp match to search
-                rc = re.search(s, f_path.stem)
-
-                if rc and rc.group(0):
-                    try:
-                        pattern_date = re.compile(
-                            r"(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})"
-                        )  # 20191231_230836
-                        search = pattern_date.search(f_path.stem)
-                        if search:
-                            match = search.groups()
-                            year, month, day, hour, minute, second = map(int, match)
-                            if second == 60:  # this is a bug in the flac files names
-                                second = 59
-                            f_path_dt = datetime(year, month, day, hour, minute, second)
-                            return f_path_dt
-                        else:
-                            self.log.error(f"Could not parse {f_path.name}")
-                            return None
-                    except ValueError:
-                        self.log.error(f"Could not parse {f_path.name}")
-                        return None
-
-            return f_path_dt
-
         sound_files = []
         self.df = None
-        self.log.info(
-            f"Searching in {self.audio_loc}/ for files that match the search pattern {self.prefix}* ..."
-        )
+        self.log.info(f"Searching in {self.audio_loc}/ for files that match the search patterns {self.prefixes}* ...")

         # set the window to 1 flac file to account for any missing data
         minutes_window = int(self.seconds_per_file / 60)
@@ -108,22 +72,18 @@ def run(self):

         if scheme == "file" or scheme == "":
             sound_path = Path(f"/{bucket}/{prefix}")
-            # First search for flac files
-            for filename in progressbar(
-                sorted(sound_path.rglob("*.flac")), prefix="Searching : "
-            ):
-                flac_dt = parse_filename(filename)
-                if start_dt <= flac_dt <= end_dt:
-                    self.log.info(f"Found file {filename} with timestamp {flac_dt}")
-                    sound_files.append(FlacFile(self.log, str(filename), flac_dt))
-            # Next search for wav files
-            for filename in progressbar(
-                sorted(sound_path.rglob("*.wav")), prefix="Searching : "
-            ):
-                wav_dt = parse_filename(filename)
-                if start_dt <= wav_dt <= end_dt:
-                    self.log.info(f"Found file {filename} with timestamp {wav_dt}")
-                    sound_files.append(WavFile(self.log, str(filename), wav_dt))
+            file_extensions = ["*.flac", "*.wav"]
+            for ext in file_extensions:
+                for filename in progressbar(
+                    sorted(sound_path.rglob(ext)), prefix="Searching : "
+                ):
+                    f_dt = utils.get_datetime(filename, self.prefixes)
+                    if f_dt and start_dt <= f_dt <= end_dt:
+                        self.log.info(f"Found file {filename} with timestamp {f_dt}")
+                        if ext == "*.flac":
+                            sound_files.append(FlacFile(self.log, str(filename), f_dt))
+                        if ext == "*.wav":
+                            sound_files.append(WavFile(self.log, str(filename), f_dt))

         if scheme == "gs":
             client = storage.Client.create_anonymous_client()
@@ -133,12 +93,17 @@ def run(self):
             # data is organized in a flat filesystem, so there are no optimizations here for querying blobs
             blobs = bucket_obj.list_blobs(prefix=prefix)
             for i, blob in enumerate(blobs):
-                self.log.info(f"Processing {blob.name}")
                 f_path = f"gs://{bucket}/{blob.name}"
-                f_dt = parse_filename(f_path)
+                self.log.info(f"Found {f_path}")
+                f_dt = utils.get_datetime(f_path, self.prefixes)
+                if f_dt is None:
+                    continue
                 if start_dt <= f_dt <= end_dt:
                     self.log.info(f"Found file {blob.name} with timestamp {f_dt}")
-                    sound_files.append(FlacFile(self.log, f_path, f_dt))
+                    if re.search(r"\.flac$", blob.name):
+                        sound_files.append(FlacFile(self.log, f_path, f_dt))
+                    if re.search(r"\.wav$", blob.name):
+                        sound_files.append(WavFile(self.log, f_path, f_dt))
                 # delay to avoid 400 error
                 if i % 100 == 0:
                     self.log.info(f"{i} files processed")
@@ -146,9 +111,7 @@ def run(self):
                 if f_dt > end_dt:
                     break

-        self.log.info(
-            f"Found {len(sound_files)} files to process that cover the period {start_dt} - {end_dt}"
-        )
+        self.log.info(f"Found {len(sound_files)} files to process that cover the period {start_dt} - {end_dt}")

         if len(sound_files) == 0:
             return
@@ -170,8 +133,8 @@ def run(self):
                 f"files spanning {sound_files[0].start} to {sound_files[-1].start} in self.json_base_dir..."
             )

-            self.log.debug(f" Running metadata corrector for {day}")
-            corrector = MetadataCorrector(
+            self.log.debug(f" Running metadata json_gen for {day}")
+            json_gen = JsonGenerator(
                 self.log,
                 self.df,
                 self.json_base_dir,
@@ -180,7 +143,7 @@ def run(self):
                 False,
                 self.seconds_per_file,
             )
-            corrector.run()
+            json_gen.run()

         except Exception as ex:
             self.log.exception(str(ex))
@@ -209,7 +172,7 @@ def run(self):
         log,
         uri="gs://noaa-passive-bioacoustic/nrs/audio/11/nrs_11_2019-2021/audio",
         json_base_dir=json_dir.as_posix(),
-        prefix=["NRS11"],
+        prefixes=["NRS11"],
         start=start,
         end=end,
     )
diff --git a/pbp/json_generator/gen_soundtrap.py b/pbp/meta_gen/gen_soundtrap.py
similarity index 54%
rename from pbp/json_generator/gen_soundtrap.py
rename to pbp/meta_gen/gen_soundtrap.py
index 8f96d58..3a4ad49 100644
--- a/pbp/json_generator/gen_soundtrap.py
+++ b/pbp/meta_gen/gen_soundtrap.py
@@ -1,5 +1,5 @@
 # pbp, Apache License 2.0
-# Filename: json_generator/gen_soundtrap.py
+# Filename: meta_gen/gen_soundtrap.py
 # Description: Captures SoundTrap metadata either from a local directory of S3 bucket
 import urllib
 from typing import List
@@ -7,18 +7,17 @@
 import boto3
 import datetime
 import pandas as pd
-import re
 import pytz
 from datetime import timedelta
 from pathlib import Path
-
 from progressbar import progressbar
+from pbp.meta_gen.utils import get_datetime

-from pbp.json_generator.gen_abstract import MetadataGeneratorAbstract
-from pbp.json_generator.metadata_extractor import SoundTrapWavFile
-from pbp.json_generator.corrector import MetadataCorrector
-from pbp.json_generator.utils import parse_s3_or_gcp_url, InstrumentType
+from pbp.meta_gen.gen_abstract import MetadataGeneratorAbstract
+from pbp.meta_gen.meta_reader import SoundTrapWavFile
+from pbp.meta_gen.json_generator import JsonGenerator
+from pbp.meta_gen.utils import parse_s3_or_gcp_url, InstrumentType


 class SoundTrapMetadataGenerator(MetadataGeneratorAbstract):
@@ -34,7 +33,7 @@ def __init__(
         log,  # : loguru.Logger,
         uri: str,
         json_base_dir: str,
-        prefix: List[str],
+        prefixes: List[str],
         start: datetime.datetime = START,
         end: datetime.datetime = END,
     ):
@@ -43,7 +42,7 @@ def __init__(
             The local directory or S3 bucket that contains the wav files
         :param json_base_dir:
             The local directory to write the json files to
-        :param prefix:
-            The search pattern to match the wav files, e.g. 'MARS'
+        :param prefixes:
+            The search patterns to match the wav files, e.g. '7000'
         :param start:
             The start date to search for wav files
@@ -51,7 +50,7 @@ def __init__(
             The end date to search for wav files check is done.
         :return:
         """
-        super().__init__(log, uri, json_base_dir, prefix, start, end, 0.0)
+        super().__init__(log, uri, json_base_dir, prefixes, start, end, 0.0)

     def run(self):
         try:
@@ -59,9 +58,7 @@ def run(self):
             xml_cache_path.mkdir(exist_ok=True, parents=True)
             wav_files = []
-            self.log.info(
-                f"Searching in {self.audio_loc}/*.wav for wav files that match the prefix {self.prefix}* ..."
-            )
+            self.log.info(f"Searching in {self.audio_loc}/*.wav for wav files that match the prefixes {self.prefixes}* ...")
             bucket, prefix, scheme = parse_s3_or_gcp_url(self.audio_loc)

             # This does not work for GCS
@@ -73,53 +70,16 @@ def run(self):
             start_dt = self.start - timedelta(days=1)
             end_dt = self.end + timedelta(days=1)

-            def get_file_date(xml_file: str) -> datetime:
-                """
-                Check if the xml file is in the search pattern and is within the start and end dates
-                :param xml_file:
-                    The xml file with the metadata
-                :return:
-                    Record starting datetime if the file is within the start and end dates; otherwise, return None
-                """
-                xml_file_path = Path(xml_file)
-                # see if the file is a regexp match to self.prefix
-                for s in self.prefix:
-                    rc = re.search(s, xml_file_path.stem)
-
-                    if rc and rc.group(0):
-                        try:
-                            pattern_date1 = re.compile(
-                                r"(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})"
-                            )  # 20161025T184500Z
-                            search = pattern_date1.search(xml_file_path.stem)
-                            if search:
-                                match = search.groups()
-                                year, month, day, hour, minute, second = map(int, match)
-                                f_path_dt = datetime.datetime(
-                                    year, month, day, hour, minute, second
-                                )
-                            else:
-                                f_path_dt = datetime.datetime.strptime(
-                                    xml_file_path.stem.split(".")[1], "%y%m%d%H%M%S"
-                                )
-                            if start_dt <= f_path_dt <= end_dt:
-                                return f_path_dt
-                        except ValueError:
-                            self.log.error(f"Could not parse {xml_file_path.name}")
-                            return None
-
             if scheme == "file":
                 parsed_uri = urllib.parse.urlparse(self.audio_loc)
                 wav_path = Path(parsed_uri.path)
-                for filename in progressbar(
-                    sorted(wav_path.rglob("*.xml")), prefix="Searching : "
-                ):
-                    wav_path = filename.parent / f"{filename.stem}.wav"
-                    start_dt = get_file_date(filename)
-                    if start_dt:
-                        wav_files.append(
-                            SoundTrapWavFile(wav_path.as_posix(), filename, start_dt)
-                        )
+                for filename in progressbar(sorted(wav_path.rglob("*.wav")), prefix="Searching : "):
+                    xml_path = filename.parent / f"{filename.stem}.xml"
+                    wav_dt = get_datetime(filename, self.prefixes)
+                    # Must have a timestamp to be valid and also must have a corresponding xml file
+                    if wav_dt and xml_path.exists():
+                        wav_files.append(SoundTrapWavFile(filename.as_posix(), xml_path, wav_dt))
             else:
                 # if the audio_loc is a s3 url, then we need to list the files in buckets that cover the start and end
                 # dates
                 client = boto3.client("s3")
                 paginator = client.get_paginator("list_objects")

                 operation_parameters = {"Bucket": bucket}
                 page_iterator = paginator.paginate(**operation_parameters)
-                self.log.info(
-                    f"Searching in bucket: {bucket} for .wav and .xml files between {start_dt} and {end_dt} "
-                )
+                self.log.info(f"Searching in bucket: {bucket} for .wav and .xml files between {start_dt} and {end_dt}")
+
                 # list the objects in the bucket
                 # loop through the objects and check if they match the search pattern
                 for page in page_iterator:
                     for obj in page["Contents"]:
                         key = obj["Key"]
-                        if ".xml" in key and get_file_date(key):
+                        if ".xml" in key:
                             xml_path = xml_cache_path / key
-                            wav_uri = f"s3://{bucket}/{key}".replace(
-                                "self.log.xml", "wav"
-                            )
-                            # Check if the xml file is in the cache directory
+                            # Check if the xml file is in the cache directory and download it if not
                             if not xml_path.exists():
-                                # Download the xml file to a temporary directory
                                 self.log.info(f"Downloading {key} ...")
                                 client.download_file(bucket, key, xml_path)
-                            start_dt = get_file_date(wav_uri)
-                            if start_dt:
-                                wav_files.append(
-                                    SoundTrapWavFile(wav_uri, xml_path, start_dt)
-                                )
+                        if ".wav" in key:
+                            wav_uri = f"s3://{bucket}/{key}"
+                            wav_dt = get_datetime(wav_uri, self.prefixes)
+                            if wav_dt:
+                                wav_files.append(SoundTrapWavFile(wav_uri, xml_path, wav_dt))

-            self.log.info(
-                f"Found {len(wav_files)} files to process that cover the period {start_dt} - {end_dt}"
-            )
+            self.log.info(f"Found {len(wav_files)} files to process that cover the period {start_dt} - {end_dt}")

             if len(wav_files) == 0:
                 return
@@ -168,16 +121,15 @@ def run(self):
             wav_files.sort(key=lambda x: x.start)

             # create a dataframe from the wav files
-            self.log.info(
-                f"Creating dataframe from {len(wav_files)} files spanning {wav_files[0].start} to {wav_files[-1].start}..."
-            )
+            self.log.info(f"Creating dataframe from {len(wav_files)} files spanning "
+                          f"{wav_files[0].start} to {wav_files[-1].start}...")
             for wc in wav_files:
                 df_wav = wc.to_df()

                 # concatenate the metadata to the dataframe
                 self.df = pd.concat([self.df, df_wav], axis=0)

-            # drop any rows with duplicate uris, keeping the first
+            # drop any rows with duplicate uris - sometimes the same file is found in multiple searches
             self.df = self.df.drop_duplicates(subset=["uri"], keep="first")

         except Exception as ex:
@@ -191,16 +143,16 @@ def run(self):

         # Correct the metadata for each day
         for day in range(days):
-            self.log.debug(f"Running metadata corrector for {day}")
-            corrector = MetadataCorrector(
+            self.log.debug(f"Running metadata json_gen for {day}")
+            json_gen = JsonGenerator(
                 self.log,
                 self.df,
                 self.json_base_dir,
                 self.start + timedelta(days=day),
-                InstrumentType.NRS,
+                InstrumentType.SOUNDTRAP,
                 False,
             )
-            corrector.run()
+            json_gen.run()


 if __name__ == "__main__":
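A note on the pairing logic above: each SoundTrap recording is assumed to arrive as a `.wav` plus sidecar `.xml` with a shared stem (the names below are hypothetical, following the `--prefixes` help text):

```python
# Hypothetical SoundTrap pair, as assumed by the searches in run() above:
#   7000.20220902.000000.wav  <- audio; timestamp parsed from the name by get_datetime()
#   7000.20220902.000000.xml  <- sidecar metadata read by SoundTrapWavFile
# Local scheme: a wav is kept only when its sibling xml exists next to it.
# S3 scheme: xml keys are cached locally, and each wav key is paired with the
# xml_path seen in a preceding listing entry, relying on the bucket listing
# keeping each pair adjacent.
```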
diff --git a/pbp/json_generator/corrector.py b/pbp/meta_gen/json_generator.py
similarity index 89%
rename from pbp/json_generator/corrector.py
rename to pbp/meta_gen/json_generator.py
index bdf41f8..d420063 100644
--- a/pbp/json_generator/corrector.py
+++ b/pbp/meta_gen/json_generator.py
@@ -11,10 +11,10 @@
 import tempfile
 import json

-from pbp.json_generator.utils import InstrumentType
+from pbp.meta_gen.utils import InstrumentType


-class MetadataCorrector:
+class JsonGenerator:
     def __init__(
         self,
         log,  # : loguru.Logger,
@@ -26,7 +26,8 @@ def __init__(
         seconds_per_file: float = -1,
     ):
         """
-        Correct the metadata for a day and save to a json file
+        Generate the metadata for a day and save to a json file.
+        Only supports IcListen for drift correction.
         :param raw_df:
             The dataframe containing the raw metadata to correct
         :param json_path_out:
@@ -36,7 +37,7 @@ def __init__(
         :param instrument_type:
             The type of instrument the metadata is coming from: NRS, ICLISTEN, SOUNDTRAP
         :param time_correct:
-            True if need to adjust the time stamp based only supported for ICLISTEN
+            True to adjust the timestamps for drift; only supported for ICLISTEN
         :param seconds_per_file:
             (optional) number of seconds in each file
         """
@@ -70,9 +71,7 @@ def run(self):
                 | ((self.raw_df["end"] >= self.day) & (self.raw_df["start"] < self.day))
             ]

-            self.log.debug(
-                f"Creating metadata for day {self.day} from {len(day_df)} files..."
-            )
+            self.log.debug(f"Creating metadata for day {self.day} from {len(day_df)} files...")

             if len(day_df) == 0:
                 self.log.warning(f"No metadata found for day {self.day}")
@@ -84,9 +83,7 @@ def run(self):
             day_df["end"] = pd.to_datetime(day_df["end"])

             # get the file list that covers the requested day
-            self.log.info(
-                f'Found {len(day_df)} files from day {self.day}, starting {day_df.iloc[0]["start"]} ending {day_df.iloc[-1]["end"]}'
-            )
+            self.log.info(f'Found {len(day_df)} files from day {self.day}, starting {day_df.iloc[0]["start"]} ending {day_df.iloc[-1]["end"]}')

             # if there are no files, then return
             if len(day_df) == 0:
@@ -95,13 +92,8 @@ def run(self):
             for index, row in day_df.iterrows():
                 self.log.debug(f'File {row["uri"]} duration {row["duration_secs"]} ')

-                if (
-                    self.seconds_per_file > 0
-                    and row["duration_secs"] != self.seconds_per_file
-                ):
-                    self.log.warning(
-                        f'File {row["duration_secs"]} != {self.seconds_per_file}. File is not complete'
-                    )
+                if 0 < self.seconds_per_file != row["duration_secs"]:
+                    self.log.warning(f'File duration {row["duration_secs"]} != expected {self.seconds_per_file}. File is not complete')

             # check whether there is a discrepancy between the number of seconds in the file and the number
             # of seconds in the metadata. If there is a discrepancy, then correct the metadata
@@ -162,9 +154,7 @@ def run(self):
         except Exception as e:
             self.log.exception(f"Error correcting metadata for {self.day}. {e}")
         finally:
-            self.log.debug(
-                f"Done correcting metadata for {self.day}. Saved to {self.json_base_dir}"
-            )
+            self.log.debug(f"Done correcting metadata for {self.day}. Saved to {self.json_base_dir}")

     def no_jitter(self, day_df: pd.DataFrame) -> pd.DataFrame:
         """
@@ -190,13 +180,13 @@ def no_jitter(self, day_df: pd.DataFrame) -> pd.DataFrame:

     def save_day(self, day: datetime.datetime, day_df: pd.DataFrame, prefix: str = ""):
         """
-        Save the day's metadata to a single json file either locally or to s3
+        Save the day's metadata to a single json file locally
         :param day:
             The day to save
         :param day_df:
             The dataframe containing the metadata for the day
         :param prefix:
             An optional prefix for the filename
         :return:
         """
         # if the exception column is full of empty strings, then drop it
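For reference, the day-selection predicate retained in `run()` keeps files that start within the target day, plus files that start earlier but spill into it; an illustration with made-up times:

```python
# Illustration (made-up times) of the day filter in JsonGenerator.run() above,
# for day = 2023-07-18 00:00:
#   start 2023-07-18 00:10, end 2023-07-18 00:20  -> kept   (starts within the day)
#   start 2023-07-17 23:55, end 2023-07-18 00:05  -> kept   (starts before, spills into the day)
#   start 2023-07-17 23:00, end 2023-07-17 23:10  -> dropped (entirely in the previous day)
```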
diff --git a/pbp/json_generator/metadata_extractor.py b/pbp/meta_gen/meta_reader.py
similarity index 96%
rename from pbp/json_generator/metadata_extractor.py
rename to pbp/meta_gen/meta_reader.py
index bf6fb52..21abd78 100755
--- a/pbp/json_generator/metadata_extractor.py
+++ b/pbp/meta_gen/meta_reader.py
@@ -1,6 +1,8 @@
 # pbp, Apache License 2.0
-# Filename: json_generator/metadata_extractor.py
-# Description: Utilities for wav file metadata reading. Supports SoundTrap, NRS and and icListen audio files
+# Filename: meta_gen/meta_reader.py
+# Description: Utilities for efficiently reading audio metadata either locally or from a remote source.
+# Wraps the metadata into classes for easy access and transformation into a pandas dataframe.
+# Supports SoundTrap, NRS and icListen audio files

 from pathlib import Path
 from typing import Optional
@@ -12,7 +14,7 @@
 import pandas as pd
 from datetime import datetime, timedelta
 import xml.etree.ElementTree as ET
-from pbp.json_generator.utils import parse_s3_or_gcp_url
+from pbp.meta_gen.utils import parse_s3_or_gcp_url


 class AudioFile:
@@ -38,7 +40,7 @@ def has_exception(self):
         return True if len(self.exception) > 0 else False

     def to_df(self):
-        # if the self.path_or_url is a url, then add to the data frame with the appropriate prefix
+        # if the self.path_or_url is a url, then add it to the data frame with the appropriate scheme prefix
         if "s3://" in self.path_or_url or "gs://" in self.path_or_url:
             df = pd.DataFrame(
                 {
diff --git a/pbp/meta_gen/utils.py b/pbp/meta_gen/utils.py
new file mode 100644
index 0000000..6bd552d
--- /dev/null
+++ b/pbp/meta_gen/utils.py
@@ -0,0 +1,87 @@
+# pbp, Apache License 2.0
+# Filename: meta_gen/utils.py
+# Description: Utility functions for parsing S3, GS or local file urls and defining sound instrument types for metadata generation
+import re
+from typing import Tuple, List
+from urllib.parse import urlparse
+from datetime import datetime
+from pathlib import Path
+
+
+class InstrumentType:
+    NRS = "NRS"
+    ICLISTEN = "ICLISTEN"
+    SOUNDTRAP = "SOUNDTRAP"
+
+
+def parse_s3_or_gcp_url(url) -> Tuple[str, str, str]:
+    """
+    Parse the S3, GS or local file url
+    :param url: The url to parse, e.g. s3://bucket/prefix, gs://bucket/prefix, file://path/to/file
+    :return: a tuple with the bucket, prefix and scheme
+    """
+    parsed_url = urlparse(url)
+    bucket = parsed_url.netloc
+    prefix = parsed_url.path.lstrip("/")
+    if parsed_url.scheme == "file":
+        bucket = ""
+        prefix = parsed_url.path
+    return bucket, prefix, parsed_url.scheme
+
+
+# Function to extract the timecode
+def extract_timecode(filename: str, prefixes: List[str]):
+    """
+    Extract the timecode from a filename
+    :param filename: The filename to extract the timecode from
+    :param prefixes: The prefixes to match the filename, e.g. MARS, NRS11, 6000
+    :return: The timecode or None if the timecode cannot be extracted
+    """
+    # Define the regex patterns for the different formats, e.g. MARS_YYYYMMDD_HHMMSS.wav,
+    # NRS11_20191023_222213.flac, 6000.221111155338.wav
+    patterns = {
+        "underscore_format1": r"{}[._]?(\d{{8}})_(\d{{6}})\.\w+$",
+        "underscore_format2": r"{}[._]?(\d{{6}})_(\d{{6}})\.\w+$",
+        "dot_format": r"{}[._]?(\d{{12}})\.\w+$",
+        "iso_format": r"{}[._]?(\d{{8}}T\d{{6}}Z)\.\w+$",
+    }
+    for prefix in prefixes:
+        for pattern_name, pattern in patterns.items():
+            regex = pattern.format(prefix)
+            match = re.match(regex, Path(filename).name)
+            if match:
+                timecode_parts = match.groups()
+                # Correct the seconds if they are 60 - this happens in some NRS files
+                hhmmss = timecode_parts[-1]
+                if hhmmss[-2:] == "60":
+                    hhmmss = hhmmss[:-2] + "59"
+                corrected_timecode = timecode_parts[:-1] + (hhmmss,)
+                return "".join(corrected_timecode)
+    return None
+
+
+def get_datetime(time_str: str, prefixes: List[str]):
+    """
+    Parse all possible time formats in the time_str into a datetime object
+    :param time_str: The time string to parse
+    :param prefixes: The prefixes to match the filename, e.g. MARS, NRS11, 6000
+    :return: datetime object or None if the time_str cannot be parsed
+    """
+    time_str = extract_timecode(time_str, prefixes)
+    if time_str is None:
+        return None
+    possible_dt_formats = [
+        "%Y%m%d_%H%M%S",
+        "%y%m%d_%H%M%S",
+        "%y%m%d%H%M%S",
+        "%Y%m%d%H%M%S",
+        "%Y%m%dT%H%M%SZ",
+        "%Y%m%dT%H%M%S",
+    ]
+    for fmt in possible_dt_formats:
+        try:
+            return datetime.strptime(time_str, fmt)
+        except ValueError:
+            continue
+
+    return None
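A sketch of the naming conventions `extract_timecode`/`get_datetime` handle, mirroring `test_datetime_support` at the end of this diff:

```python
from datetime import datetime
from pbp.meta_gen.utils import get_datetime

# One example per supported pattern (values taken from the tests below):
assert get_datetime("MARS_20191022_235758.wav", ["MARS_"]) == datetime(2019, 10, 22, 23, 57, 58)    # underscore_format1
assert get_datetime("6000.221011155338.wav", ["6000"]) == datetime(2022, 10, 11, 15, 53, 38)        # dot_format
assert get_datetime("MARS_20191022T235743Z.wav", ["MARS_"]) == datetime(2019, 10, 22, 23, 57, 43)   # iso_format
assert get_datetime("NRS11_20191023_222260.flac", ["NRS11_"]) == datetime(2019, 10, 23, 22, 22, 59) # ":60" corrected to ":59"
assert get_datetime("unrelated.wav", ["MARS_"]) is None  # no prefix/pattern match
```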
diff --git a/pyproject.toml b/pyproject.toml
index a940199..d4d5141 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ packages = [{include = "pbp"}]
 pbp = "pbp.main:main"
 pbp-cloud = "pbp.main_cloud:main"
 pbp-plot = "pbp.plot:main"
-pbp-json-gen = "pbp.main_json_generator:main"
+pbp-meta-gen = "pbp.main_meta_generator:main"

 [tool.poetry.dependencies]
 python = ">=3.9,<3.12.0"
@@ -60,7 +60,7 @@ exclude = [
 ]

 [tool.ruff]
-line-length = 90
+line-length = 150
 exclude = [
     "virtenv/**"
 ]
diff --git a/tests/test_json_generator.py b/tests/test_meta_generator.py
similarity index 77%
rename from tests/test_json_generator.py
rename to tests/test_meta_generator.py
index 30d525a..afc820c 100644
--- a/tests/test_json_generator.py
+++ b/tests/test_meta_generator.py
@@ -15,9 +15,9 @@
 from pathlib import Path

 from pbp.logging_helper import create_logger
-from pbp.json_generator.gen_nrs import NRSMetadataGenerator
-from pbp.json_generator.gen_soundtrap import SoundTrapMetadataGenerator
-from pbp.json_generator.gen_iclisten import IcListenMetadataGenerator
+from pbp.meta_gen.gen_nrs import NRSMetadataGenerator
+from pbp.meta_gen.gen_soundtrap import SoundTrapMetadataGenerator
+from pbp.meta_gen.gen_iclisten import IcListenMetadataGenerator


 # which is .gitignore'ed
@@ -38,6 +38,9 @@ def create_test_logger(name: str):

 def create_json_dir(name: str) -> Path:
     json_dir = OUT_BASE_DIR / name
+    if json_dir.exists():
+        import shutil
+        shutil.rmtree(json_dir)
     json_dir.mkdir(exist_ok=True, parents=True)
     return json_dir

@@ -74,14 +77,14 @@ def get_aws_account() -> Union[str, None]:
     not AWS_AVAILABLE,
     reason="This test is excluded because it requires a valid AWS account",
 )
-def test_soundtrap_json_generator():
+def test_soundtrap_generator():
     """
     Test fixture for SoundTrapMetadataGenerator.
     Tests the SoundTrapMetadataGenerator class ability to generate metadata for soundtrap recording files.
     Two files should be generated in the json directory for the dates specified.
     :return:
     """
-    log = create_test_logger("test_soundtrap_metadata_generator")
+    log = create_test_logger("test_soundtrap_generator")
     json_dir = create_json_dir("soundtrap")

     start = datetime(2023, 7, 15)
@@ -90,7 +93,7 @@ def test_soundtrap_json_generator():
         log=log,
         uri="s3://pacific-sound-ch01",
         json_base_dir=json_dir.as_posix(),
-        prefix=["7000"],
+        prefixes=["7000"],
         start=start,
         end=end,
     )
@@ -113,7 +116,7 @@ def test_soundtrap_json_generator():
     not AWS_AVAILABLE,
     reason="This test is excluded because it requires a valid AWS account",
 )
-def test_iclisten_json_generator():
+def test_iclisten_generator():
     """
     Test fixture for IcListenMetadataGenerator.
     Tests the IcListenMetadataGenerator class ability to generate metadata for soundtrap recording files.
@@ -121,7 +124,7 @@ def test_iclisten_generator():
     only works for MBARI MARS ICListen data
     :return:
     """
-    log = create_test_logger("test_mars_metadata_generator")
+    log = create_test_logger("test_iclisten_generator")
     json_dir = create_json_dir("mars")

     start = datetime(2023, 7, 18, 0, 0, 0)
@@ -132,7 +135,7 @@ def test_iclisten_generator():
         log=log,
         uri="s3://pacific-sound-256khz",
         json_base_dir=json_dir.as_posix(),
-        prefix=["MARS_"],
+        prefixes=["MARS"],
         start=start,
         end=end,
         seconds_per_file=600,
@@ -150,14 +153,14 @@ def test_iclisten_generator():
     assert len(json_objects) == 145


-def test_nrs_json_generator():
+def test_nrs_generator():
     """
     Test fixture for NRSMetadataGenerator.
     Tests the NRSMetadataGenerator class ability to generate metadata for NRS recording files.
     One files should be generated in the json directory for the date specified.
     :return:
     """
-    log = create_test_logger("test_nrs_metadata_generator")
+    log = create_test_logger("test_nrs_generator")
     json_dir = create_json_dir("nrs")

     start = datetime(2019, 10, 24, 0, 0, 0)
@@ -167,7 +170,7 @@ def test_nrs_generator():
         log=log,
         uri="gs://noaa-passive-bioacoustic/nrs/audio/11/nrs_11_2019-2021/audio",
         json_base_dir=json_dir.as_posix(),
-        prefix=["NRS11"],
+        prefixes=["NRS11"],
         start=start,
         end=end,
         seconds_per_file=14400.0,
@@ -183,3 +186,31 @@ def test_nrs_generator():
     with open(json_file) as f:
         json_objects = json.load(f)
     assert len(json_objects) == 7
+
+
+def test_datetime_support():
+    """
+    Test fixture for all audio file formats.
+    Tests the ability to extract the datetime from the audio file name.
+    :return:
+    """
+    filenames = [
+        "s3://MARS_20191022_235758.wav",
+        "gs://6550.221113155338.wav",
+        "NRS11_20191023_222260.flac",  # Invalid seconds example
+        "gs://6000.221011155338.wav",
+        "MARS_20191022T235743Z.wav",
+        "6000.230111155338.wav",
+    ]
+    prefixes = ["MARS_", "6550", "NRS11_", "6000", "MARS_", "6000"]
+    expected = [
+        datetime(2019, 10, 22, 23, 57, 58),
+        datetime(2022, 11, 13, 15, 53, 38),
+        datetime(2019, 10, 23, 22, 22, 59),
+        datetime(2022, 10, 11, 15, 53, 38),
+        datetime(2019, 10, 22, 23, 57, 43),
+        datetime(2023, 1, 11, 15, 53, 38),
+    ]
+    from pbp.meta_gen.utils import get_datetime
+    for filename, prefix, exp in zip(filenames, prefixes, expected):
+        assert get_datetime(filename, [prefix]) == exp
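One way to exercise the renamed test module locally (the datetime test needs no cloud credentials; `pytest.main` is standard pytest API):

```python
import pytest

# Run just the credential-free datetime test from the renamed module.
pytest.main(["-v", "tests/test_meta_generator.py::test_datetime_support"])
```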