Commit
search for non_event_indices now more memory efficient (#317)
Co-authored-by: Josef Haupt <josef.haupt@phil.tu-chemnitz.de>
Josef-Haupt and Josef Haupt committed Apr 29, 2024
1 parent 2f0df2a commit 1e55caf
Showing 1 changed file with 28 additions and 16 deletions: utils.py
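The commit's core change swaps a full elementwise zero test for a per-row reduction when collecting samples that carry no positive label. The old expression materializes an (N, C) boolean mask and returns the row index of every zero entry, so event rows containing any zero also appear, once per zero; the new one reduces each row to a scalar first, keeping intermediates at length N and yielding each all-zero row exactly once. A minimal before/after sketch (the toy label matrix is illustrative, not from the repository):

import numpy as np

# Toy multi-hot label matrix: rows are samples, columns are classes.
y = np.array([
    [0, 1, 0],  # event sample (class 1)
    [0, 0, 0],  # non-event sample
    [1, 0, 1],  # event sample (classes 0 and 2)
])

# Old: (N, C) boolean mask, one index per zero entry.
old = np.where(y == 0)[0]                  # -> [0 0 1 1 1 2]

# New: length-N row sums, one index per all-zero row.
new = np.where(np.sum(y, axis=1) == 0)[0]  # -> [1]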
@@ -1,5 +1,6 @@
"""Module containing common function.
"""

import os
import traceback
import numpy as np
@@ -27,6 +28,7 @@ def collect_audio_files(path: str):

return sorted(files)


def collect_all_files(path: str, filetypes: list[str], pattern: str = ""):
"""Collects all files of the given filetypes in the given directory.
@@ -76,6 +78,7 @@ def list_subdirectories(path: str):
"""
return filter(lambda el: os.path.isdir(os.path.join(path, el)), os.listdir(path))
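Note that list_subdirectories returns a lazy filter object rather than a list, so it is consumed by iterating. A usage sketch (the directory path is hypothetical):

from utils import list_subdirectories  # the module shown in this diff

for name in list_subdirectories("datasets"):  # hypothetical path
    print(name)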


def random_multilabel_split(x, y, val_ratio=0.2):
"""Splits the data into training and validation data.
@@ -88,15 +91,15 @@ def random_multilabel_split(x, y, val_ratio=0.2):
Returns:
A tuple of (x_train, y_train, x_val, y_val).
"""

# Set numpy random seed
np.random.seed(cfg.RANDOM_SEED)

# Find all combinations of labels
class_combinations = np.unique(y, axis=0)

# Initialize training and validation data
x_train, y_train, x_val, y_val = [], [], [], []

@@ -118,13 +121,13 @@ def random_multilabel_split(x, y, val_ratio=0.2):
# Randomly choose samples for training and validation
np.random.shuffle(indices)
train_indices = indices[:num_samples_train]
- val_indices = indices[num_samples_train:num_samples_train + num_samples_val]
+ val_indices = indices[num_samples_train : num_samples_train + num_samples_val]
# Append samples to training and validation data
x_train.append(x[train_indices])
y_train.append(y[train_indices])
x_val.append(x[val_indices])
y_val.append(y[val_indices])

# Concatenate data
x_train = np.concatenate(x_train)
y_train = np.concatenate(y_train)
@@ -142,7 +145,8 @@ def random_multilabel_split(x, y, val_ratio=0.2):
x_val = x_val[indices]
y_val = y_val[indices]

- return x_train, y_train, x_val, y_val
+ return x_train, y_train, x_val, y_val


def random_split(x, y, val_ratio=0.2):
"""Splits the data into training and validation data.
@@ -183,32 +187,31 @@ def random_split(x, y, val_ratio=0.2):
# Randomly choose samples for training and validation
np.random.shuffle(positive_indices)
train_indices = positive_indices[:num_samples_train]
- val_indices = positive_indices[num_samples_train:num_samples_train + num_samples_val]
+ val_indices = positive_indices[num_samples_train : num_samples_train + num_samples_val]

# Append samples to training and validation data
x_train.append(x[train_indices])
y_train.append(y[train_indices])
x_val.append(x[val_indices])
y_val.append(y[val_indices])

# Append negative samples to training data
x_train.append(x[negative_indices])
y_train.append(y[negative_indices])

# Add samples for non-event classes to training and validation data
- non_event_indices = np.where(y[:,:] == 0)[0]
+ non_event_indices = np.where(np.sum(y[:, :], axis=1) == 0)[0]
num_samples = len(non_event_indices)
num_samples_train = max(1, int(num_samples * (1 - val_ratio)))
num_samples_val = max(0, num_samples - num_samples_train)
np.random.shuffle(non_event_indices)
train_indices = non_event_indices[:num_samples_train]
- val_indices = non_event_indices[num_samples_train:num_samples_train + num_samples_val]
+ val_indices = non_event_indices[num_samples_train : num_samples_train + num_samples_val]
x_train.append(x[train_indices])
y_train.append(y[train_indices])
x_val.append(x[val_indices])
y_val.append(y[val_indices])


# Concatenate data
x_train = np.concatenate(x_train)
y_train = np.concatenate(y_train)
@@ -258,7 +261,7 @@ def mixup(x, y, augmentation_ratio=0.25, alpha=0.2):
mixed_up_indices = []

for _ in range(num_samples_to_augment):

# Randomly choose one instance from the positive samples
index = np.random.choice(positive_indices)

@@ -292,7 +295,7 @@ def mixup(x, y, augmentation_ratio=0.25, alpha=0.2):

del mixed_x
del mixed_y

return x, y
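The loop above blends randomly chosen sample pairs. A standalone sketch of the standard mixup operation, assuming a Beta-distributed coefficient and soft-label mixing (this hunk does not show the repository's exact label rule):

import numpy as np

def mixup_pair(x1, x2, y1, y2, alpha=0.2, rng=None):
    # For small alpha the symmetric Beta concentrates lam near 0 or 1,
    # so one sample usually dominates the blend.
    if rng is None:
        rng = np.random.default_rng()
    lam = rng.beta(alpha, alpha)
    mixed_x = lam * x1 + (1.0 - lam) * x2
    mixed_y = lam * y1 + (1.0 - lam) * y2  # soft labels: an assumption here
    return mixed_x, mixed_y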


@@ -379,7 +382,7 @@ def applyMean(x, y, random_indices):
random_indices = np.random.choice(np.where(y == minority_label)[0], 2)

# Calculate the mean of the two samples
- applyMean(x, y, random_indices)
+ applyMean(x, y, random_indices)

else:
for i in range(y.shape[1]):
@@ -460,7 +463,7 @@ def applySmote(x, y, random_index, k=5):
random_index = np.random.choice(np.where(y == minority_label)[0])

# Apply SMOTE
- applySmote(x, y, random_index)
+ applySmote(x, y, random_index)

else:
for i in range(y.shape[1]):
@@ -501,7 +504,14 @@ def saveToCache(cache_file: str, x_train: np.ndarray, y_train: np.ndarray, label
os.makedirs(os.path.dirname(cache_file), exist_ok=True)

# Save to cache
- np.savez_compressed(cache_file, x_train=x_train, y_train=y_train, labels=labels, binary_classification=cfg.BINARY_CLASSIFICATION, multi_label=cfg.MULTI_LABEL)
+ np.savez_compressed(
+     cache_file,
+     x_train=x_train,
+     y_train=y_train,
+     labels=labels,
+     binary_classification=cfg.BINARY_CLASSIFICATION,
+     multi_label=cfg.MULTI_LABEL,
+ )
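Everything written by this call can be read back with np.load, which the loadFromCache function below presumably wraps; a minimal standalone sketch (the file name is hypothetical, the keys match the savez_compressed call):

import numpy as np

data = np.load("train_cache.npz")  # hypothetical cache path
x_train, y_train = data["x_train"], data["y_train"]
labels = data["labels"]  # stored as a NumPy string array
binary = bool(data["binary_classification"])  # scalars return as 0-d arrays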


def loadFromCache(cache_file: str):
@@ -547,12 +557,14 @@ def writeErrorLog(ex: Exception):
with open(cfg.ERROR_LOG_FILE, "a") as elog:
elog.write("".join(traceback.TracebackException.from_exception(ex).format()) + "\n")


def img2base64(path):

import base64

with open(path, "rb") as img_file:
- return base64.b64encode(img_file.read()).decode('utf-8')
+ return base64.b64encode(img_file.read()).decode("utf-8")


def save_model_params(file_path):
"""Saves the params used to train the custom classifier.
