From 9c7e4b6c16c9460b79ebd444cefc1d7855cdce44 Mon Sep 17 00:00:00 2001 From: AllenDowney Date: Wed, 7 Sep 2022 14:39:06 +0000 Subject: [PATCH 1/2] Checking for missing files in parallel --- zamba/models/config.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/zamba/models/config.py b/zamba/models/config.py index 426a126e..31a95dc8 100644 --- a/zamba/models/config.py +++ b/zamba/models/config.py @@ -7,9 +7,11 @@ import appdirs import ffmpeg from loguru import logger +import numpy as np import pandas as pd from pydantic import BaseModel from pydantic import DirectoryPath, FilePath, validator, root_validator +from pqdm.threads import pqdm import torch from tqdm import tqdm import yaml @@ -100,11 +102,23 @@ def check_files_exist_and_load( # we can have multiple rows per file with labels so limit just to one row per file for these checks files_df = df[["filepath"]].drop_duplicates() - # check data exists - logger.info( - f"Checking all {len(files_df):,} filepaths exist. Can take up to a minute for every couple thousand files." - ) - exists = files_df["filepath"].path.exists() + # check for missing files + logger.info(f"Checking all {len(files_df):,} filepaths exist. Trying fast file checking...") + + # try to check files in parallel + paths = files_df["filepath"].apply(Path) + exists = pqdm(paths, Path.exists, n_jobs=16) + exists = np.array(exists) + + # if fast checking fails, fall back to slow checking + # if an I/O error is in `exists`, the array has dtype `object` + if exists.dtype != bool: + logger.info( + "Fast file checking failed. Running slower check, which can take 30 seconds per thousand files." 
+ ) + exists = files_df["filepath"].path.exists() + + # select the missing files invalid_files = files_df[~exists] # if no files exist From b5fd541386e3c7a9e0b98c71ca0597c29c1e0873 Mon Sep 17 00:00:00 2001 From: AllenDowney Date: Wed, 7 Sep 2022 15:33:31 +0000 Subject: [PATCH 2/2] Adding pqdm to setup.cfg --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 1eb71336..d5895fd7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,6 +31,7 @@ install_requires = openpyxl pandas>1.2.0 pandas_path + pqdm pydantic python-dotenv pytorch-lightning>=1.6.0