Skip to content

Commit

Permalink
added coverage plot, explicit underscore and removed any AWS specific …
Browse files Browse the repository at this point in the history
…requirements
  • Loading branch information
danellecline committed Aug 21, 2024
1 parent 90c3419 commit f92cd8d
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 67 deletions.
20 changes: 13 additions & 7 deletions pbp/meta_gen/gen_iclisten.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# pbp, Apache License 2.0
# Filename: metadata/generator/gen_iclisten.py
# Description: Captures ICListen wav metadata in a pandas dataframe from either a local directory or S3 bucket.

import os
from datetime import timedelta
from datetime import datetime
from typing import List
Expand All @@ -11,7 +11,7 @@
import pandas as pd
from pathlib import Path
from progressbar import progressbar
import pbp.meta_gen.utils as utils
from pbp.meta_gen.utils import InstrumentType, parse_s3_or_gcp_url, get_datetime, plot_daily_coverage
from pbp.meta_gen.json_generator import JsonGenerator
from pbp.meta_gen.meta_reader import GenericWavFile
from pbp.meta_gen.gen_abstract import MetadataGeneratorAbstract
Expand Down Expand Up @@ -54,7 +54,7 @@ def run(self):
f"{self.log_prefix} Generating metadata for {self.start} to {self.end}..."
)

bucket_name, prefix, scheme = utils.parse_s3_or_gcp_url(self.audio_loc)
bucket_name, prefix, scheme = parse_s3_or_gcp_url(self.audio_loc)

# gs is not supported for icListen
if scheme == "gs":
Expand Down Expand Up @@ -84,15 +84,19 @@ def run(self):
for filename in progressbar(
sorted(wav_path.rglob("*.wav")), prefix="Searching : "
):
wav_dt = utils.get_datetime(filename, self.prefixes)
wav_dt = get_datetime(filename, self.prefixes)
if wav_dt and start_dt <= wav_dt <= end_dt:
self.log.info(
f"Found file {filename} with timestamp {wav_dt}"
)
wav_files.append(GenericWavFile(self.log, filename, wav_dt))

if scheme == "s3":
client = boto3.client("s3")
kwargs = {}
aws_region = os.getenv("AWS_REGION")
if aws_region is not None:
kwargs["region_name"] = aws_region
client = boto3.client("s3", **kwargs)
for day_hour in pd.date_range(start=start_dt, end=end_dt, freq="h"):
bucket = f"{bucket_name}-{day_hour.year:04d}"

Expand All @@ -117,7 +121,7 @@ def run(self):

for obj in page["Contents"]:
key = obj["Key"]
wav_dt = utils.get_datetime(
wav_dt = get_datetime(
f"s3://{bucket}/{key}", self.prefixes
)
if wav_dt is None:
Expand Down Expand Up @@ -165,7 +169,7 @@ def run(self):
self.df,
self.json_base_dir,
day,
utils.InstrumentType.NRS,
InstrumentType.ICLISTEN,
True,
self.seconds_per_file,
)
Expand All @@ -174,6 +178,8 @@ def run(self):
except Exception as ex:
self.log.exception(str(ex))

plot_file = plot_daily_coverage(InstrumentType.ICLISTEN, self.df, self.json_base_dir, self.start, self.end)
self.log.info(f"Plot file: {plot_file}")

if __name__ == "__main__":
from pbp.logging_helper import create_logger
Expand Down
5 changes: 4 additions & 1 deletion pbp/meta_gen/gen_nrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from pbp.meta_gen.json_generator import JsonGenerator
from pbp.meta_gen.meta_reader import FlacFile, GenericWavFile as WavFile
from pbp.meta_gen.gen_abstract import MetadataGeneratorAbstract
from pbp.meta_gen.utils import parse_s3_or_gcp_url, InstrumentType
from pbp.meta_gen.utils import parse_s3_or_gcp_url, InstrumentType, plot_daily_coverage


class NRSMetadataGenerator(MetadataGeneratorAbstract):
Expand Down Expand Up @@ -152,6 +152,9 @@ def run(self):
except Exception as ex:
self.log.exception(str(ex))

# plot the daily coverage
plot_file = plot_daily_coverage(InstrumentType.NRS, self.df, self.json_base_dir, self.start, self.end)
self.log.info(f"Coverage plot saved to {plot_file}")

if __name__ == "__main__":
from pbp.logging_helper import create_logger
Expand Down
42 changes: 23 additions & 19 deletions pbp/meta_gen/gen_soundtrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@
from datetime import timedelta
from pathlib import Path
from progressbar import progressbar
from pbp.meta_gen.utils import get_datetime

from pbp.meta_gen.gen_abstract import MetadataGeneratorAbstract
from pbp.meta_gen.meta_reader import SoundTrapWavFile
from pbp.meta_gen.json_generator import JsonGenerator
from pbp.meta_gen.utils import parse_s3_or_gcp_url, InstrumentType
from pbp.meta_gen.utils import parse_s3_or_gcp_url, InstrumentType, get_datetime, plot_daily_coverage


class SoundTrapMetadataGenerator(MetadataGeneratorAbstract):
Expand Down Expand Up @@ -82,7 +81,7 @@ def run(self):
xml_path = filename.parent / f"{filename.stem}.xml"
start_dt = get_datetime(wav_path, self.prefixes)
# Must have a start date to be valid and also must have a corresponding xml file
if start_dt and xml_path.exists():
if start_dt and xml_path.exists() and start_dt <= start_dt <= end_dt:
wav_files.append(
SoundTrapWavFile(wav_path.as_posix(), xml_path, start_dt)
)
Expand All @@ -105,22 +104,23 @@ def run(self):
for page in page_iterator:
for obj in page["Contents"]:
key = obj["Key"]

if ".xml" in key:
xml_path = xml_cache_path / key

# Check if the xml file is in the cache directory and download it if not
if not xml_path.exists():
self.log.info(f"Downloading {key} ...")
client.download_file(bucket, key, xml_path)

if ".wav" in key:
wav_uri = f"s3://{bucket}/{key}"
wav_dt = get_datetime(wav_uri, self.prefixes)
if wav_dt:
wav_files.append(
SoundTrapWavFile(wav_uri, xml_path, wav_dt)
)
uri = f"s3://{bucket}/{key}"
key_dt = get_datetime(uri, self.prefixes)
xml_path = xml_cache_path / key
xml_path = xml_path.with_suffix(".xml")
key_xml = key.replace(".wav", ".log.xml")

if key_dt is None:
continue
if start_dt <= key_dt <= end_dt and key.endswith(".wav"):
# download the associated xml file to the wav file and create a SoundTrapWavFile object
try:
self.log.info(f"Downloading {key_xml} ...")
client.download_file(bucket, key_xml, xml_path)
wav_files.append(SoundTrapWavFile(uri, xml_path, key_dt))
except Exception as ex:
self.log.error(f"Could not download {key_xml} - {str(ex)}")
continue

self.log.info(
f"Found {len(wav_files)} files to process that cover the period {start_dt} - {end_dt}"
Expand Down Expand Up @@ -168,6 +168,10 @@ def run(self):
)
json_gen.run()

# plot the daily coverage
plot_file = plot_daily_coverage(InstrumentType.SOUNDTRAP, self.df, self.json_base_dir, self.start, self.end)
self.log.info(f"Coverage plot saved to {plot_file}")


if __name__ == "__main__":
from pbp.logging_helper import create_logger
Expand Down
40 changes: 40 additions & 0 deletions pbp/meta_gen/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from datetime import datetime
from pathlib import Path

import pandas as pd


class InstrumentType:
NRS = "NRS"
Expand Down Expand Up @@ -87,3 +89,41 @@ def get_datetime(time_str: str, prefixes: List[str]):
continue

return None


def plot_daily_coverage(instrument_type: InstrumentType, df: pd.DataFrame, base_dir: str, start: datetime, end: datetime) -> str:
    """
    Plot the daily coverage of the recordings and save the plot as a PNG file.

    Coverage is the percent of each day covered by recordings: the (end - start) durations of all rows
    whose start falls on a given day are summed and divided by the number of seconds in a day.

    :param instrument_type: The type of instrument, e.g. NRS, ICLISTEN, SOUNDTRAP; selects the plot title
    :param df: The dataframe with the recordings; its "start" and "end" columns are read. It is not modified.
    :param base_dir: The base directory to store the plot
    :param start: The start date of the recordings, used in the plot file name
    :param end: The end date of the recordings, used in the plot file name
    :return: The path of the saved plot file as a posix string
    """
    # Compute the durations on a private frame so the caller's dataframe does not silently
    # gain a "duration" column as a side effect of plotting
    ts_df = df[["start"]].copy()
    ts_df["duration"] = (df["end"] - df["start"]).dt.total_seconds()
    ts_df.set_index("start", inplace=True)

    # Sum the recorded seconds per calendar day and convert to a percent of the day (86400 seconds)
    daily_sum_df = ts_df.resample("D").sum()
    daily_sum_df["coverage"] = 100 * daily_sum_df["duration"] / 86400
    daily_sum_df["coverage"] = daily_sum_df["coverage"].round()  # round to nearest integer

    # x-axis is the date, y-axis is the daily recording coverage
    # NOTE(review): no axes is passed to plot(); presumably this draws on the current pyplot axes, so
    # repeated calls in one process may overlay earlier plots - confirm against the pandas plotting docs
    plot = daily_sum_df["coverage"].plot()
    plot.set_ylabel("Daily Coverage (%)")
    plot.set_xlabel("Date")
    # Force the x-axis labels to be in the format YYYY-MM-DD
    plot.set_xticklabels([x.strftime("%Y-%m-%d") for x in daily_sum_df.index])
    # Angle the x-axis labels for better readability
    plot.set_xticklabels(plot.get_xticklabels(), rotation=45, horizontalalignment="right")

    # An unrecognized instrument type leaves the plot without a title
    if instrument_type == InstrumentType.NRS:
        plot.set_title("Daily Coverage of NRS Recordings")
    elif instrument_type == InstrumentType.ICLISTEN:
        plot.set_title("Daily Coverage of icListen Recordings")
    elif instrument_type == InstrumentType.SOUNDTRAP:
        plot.set_title("Daily Coverage of SoundTrap Recordings")

    # NOTE(review): the file name prefix is "soundtrap_coverage" for every instrument type; it is kept
    # as is because existing callers and tests look for this exact name
    plot_file = Path(base_dir) / f"soundtrap_coverage_{start:%Y%m%d}_{end:%Y%m%d}.png"
    dpi = 300
    fig = plot.get_figure()
    fig.set_size_inches(10, 5)
    fig.set_dpi(dpi)
    fig.savefig(plot_file.as_posix(), bbox_inches="tight")
    return plot_file.as_posix()
54 changes: 14 additions & 40 deletions tests/test_meta_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from pbp.meta_gen.gen_soundtrap import SoundTrapMetadataGenerator
from pbp.meta_gen.gen_iclisten import IcListenMetadataGenerator


# which is .gitignore'ed
OUT_BASE_DIR = Path("tests/json_generator_tmp")

Expand All @@ -46,38 +45,6 @@ def create_json_dir(name: str) -> Path:
return json_dir


def get_aws_account() -> Union[str, None]:
    """
    Look up the account number of the current AWS credentials through the STS client.
    :return: The account number, or None when the lookup raises ClientError or NoCredentialsError
    """
    try:
        identity = boto3.client("sts").get_caller_identity()
        account_number = identity["Account"]
    except ClientError as e:
        print(e)
        print(
            "Could not get account number from AWS. Check your config.ini file. "
            "Account number is not set in the config.ini file and AWS credentials are not configured."
        )
        return None
    except botocore.exceptions.NoCredentialsError as e:
        print(e)
        return None

    # Reached only when the STS call succeeded
    print(f"Found account {account_number}")
    return account_number


# Check if an AWS account is configured by checking if it can access the model with the default credentials
AWS_AVAILABLE = False
if get_aws_account():
AWS_AVAILABLE = True


@pytest.mark.skipif(
not AWS_AVAILABLE,
reason="This test is excluded because it requires a valid AWS account",
)
def test_soundtrap_generator():
"""
Test fixture for SoundTrapMetadataGenerator.
Expand All @@ -100,7 +67,7 @@ def test_soundtrap_generator():
)
gen.run()

# There should be two files in the json directory named 20230715.json and 20230716.json
# There should be two files in the json directory - one for each day
json_files = list(json_dir.rglob("*.json"))
assert len(json_files) == 2
assert (json_dir / "2023/20230715.json").exists()
Expand All @@ -112,11 +79,10 @@ def test_soundtrap_generator():
json_objects = json.load(f)
assert len(json_objects) == 5

# There should also be a coverage plot in the base json directory
coverage_plot = json_dir / "soundtrap_coverage_20230715_20230716.png"
assert coverage_plot.exists()

@pytest.mark.skipif(
not AWS_AVAILABLE,
reason="This test is excluded because it requires a valid AWS account",
)
def test_iclisten_generator():
"""
Test fixture for IcListenMetadataGenerator.
Expand All @@ -136,7 +102,7 @@ def test_iclisten_generator():
log=log,
uri="s3://pacific-sound-256khz",
json_base_dir=json_dir.as_posix(),
prefixes=["MARS"],
prefixes=["MARS_"],
start=start,
end=end,
seconds_per_file=600,
Expand All @@ -153,6 +119,10 @@ def test_iclisten_generator():
json_objects = json.load(f)
assert len(json_objects) == 145

# There should also be a coverage plot in the base json directory
coverage_plot = json_dir / "soundtrap_coverage_20230718_20230718.png"
assert coverage_plot.exists()


def test_nrs_generator():
"""
Expand All @@ -171,7 +141,7 @@ def test_nrs_generator():
log=log,
uri="gs://noaa-passive-bioacoustic/nrs/audio/11/nrs_11_2019-2021/audio",
json_base_dir=json_dir.as_posix(),
prefixes=["NRS11"],
prefixes=["NRS11_"],
start=start,
end=end,
seconds_per_file=14400.0,
Expand All @@ -188,6 +158,10 @@ def test_nrs_generator():
json_objects = json.load(f)
assert len(json_objects) == 7

# There should also be a coverage plot in the base json directory
coverage_plot = json_dir / "soundtrap_coverage_20191024_20191024.png"
assert coverage_plot.exists()


def test_datetime_support():
"""
Expand Down

0 comments on commit f92cd8d

Please sign in to comment.