
Commit

Remove pandas.append to resolve issue pylabel-project#102
alexheat committed Apr 29, 2023
1 parent 721649b commit 224e5d8
Showing 3 changed files with 122 additions and 76 deletions.
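
For context: pandas deprecated DataFrame.append in version 1.4 and removed it in 2.0, which is presumably what issue pylabel-project#102 tracks, so this commit swaps every append call in splitter.py for pd.concat. A minimal sketch of the migration pattern (an illustration of the change, not code from the commit):

import pandas as pd

df_a = pd.DataFrame({"x": [1, 2]})
df_b = pd.DataFrame({"x": [3, 4]})

# Old pattern, removed in pandas 2.0:
# combined = df_a.append(df_b, ignore_index=True)

# Replacement used throughout this commit:
combined = pd.concat([df_a, df_b], ignore_index=True)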
194 changes: 120 additions & 74 deletions pylabel/splitter.py
@@ -3,150 +3,196 @@
from sklearn.model_selection import GroupShuffleSplit as sklearnGroupShuffleSplit
from pylabel.shared import schema

class Split():
def __init__(self, dataset=None):

class Split:
def __init__(self, dataset=None):
self.dataset = dataset

def UnSplit(self):
"""Unsplit the dataset by setting all values of the split column to null."""
self.dataset.df["split"] = np.nan

def GroupShuffleSplit(self, train_pct=.5, test_pct=.25, val_pct=.25, group_col = 'img_filename', random_state=None):
def GroupShuffleSplit(
self,
train_pct=0.5,
test_pct=0.25,
val_pct=0.25,
group_col="img_filename",
random_state=None,
):
"""
This function uses the GroupShuffleSplit command from sklearn. It can split into 3 groups (train,
test, and val) by applying the command twice.
"""
df_main = self.dataset.df
gss = sklearnGroupShuffleSplit(n_splits=1, train_size=train_pct)
train_indexes, test_indexes = next(gss.split(X=df_main, y=df_main[group_col], groups=df_main.index.values))
train_indexes, test_indexes = next(
gss.split(X=df_main, y=df_main[group_col], groups=df_main.index.values)
)

df_main.loc[train_indexes,'split'] = "train"
df_main.loc[test_indexes,'split'] = "test"
df_main.loc[train_indexes, "split"] = "train"
df_main.loc[test_indexes, "split"] = "test"
self.dataset.df = df_main

if val_pct:
df_train = df_main.loc[df_main['split'] == 'train']
df_test = df_main.loc[df_main['split'] == 'test']
df_train = df_main.loc[df_main["split"] == "train"]
df_test = df_main.loc[df_main["split"] == "test"]
df_test = df_test.reset_index()
second_split_pct = float(test_pct/(test_pct+val_pct))
second_split_pct = float(test_pct / (test_pct + val_pct))
gss2 = sklearnGroupShuffleSplit(n_splits=1, train_size=second_split_pct)
test_indexes_2, val_indexes_2 = next(gss2.split(X=df_test, y=df_test[group_col], groups=df_test.index.values))
df_test.loc[test_indexes_2,'split'] = "test"
df_test.loc[val_indexes_2,'split'] = "val"
self.dataset.df = df_train.append(df_test)
test_indexes_2, val_indexes_2 = next(
gss2.split(X=df_test, y=df_test[group_col], groups=df_test.index.values)
)
df_test.loc[test_indexes_2, "split"] = "test"
df_test.loc[val_indexes_2, "split"] = "val"
self.dataset.df = pd.concat([df_train, df_test])
self.dataset.df = self.dataset.df.reset_index(drop=True)
self.dataset.df = self.dataset.df[schema]

#Written with the help of https://stackoverflow.com/questions/56872664/complex-dataset-split-stratifiedgroupshufflesplit
def StratifiedGroupShuffleSplit(self, train_pct=.7, test_pct=.3, val_pct=.0, weight=0.01,
group_col = 'img_filename', cat_col = 'cat_name', batch_size=1):
# Written with the help of https://stackoverflow.com/questions/56872664/complex-dataset-split-stratifiedgroupshufflesplit
def StratifiedGroupShuffleSplit(
self,
train_pct=0.7,
test_pct=0.3,
val_pct=0.0,
weight=0.01,
group_col="img_filename",
cat_col="cat_name",
batch_size=1,
):
"""
This function will 'split' the dataframe by setting the split column equal to
train, test, or val. When a split dataset is exported, the annotations will be split into
separate groups so that they can be used in model training, testing, and validation.
"""
df_main = self.dataset.df
df_main = df_main.reindex(np.random.permutation(df_main.index)) # shuffle dataset
df_main = df_main.reindex(
np.random.permutation(df_main.index)
) # shuffle dataset

# create empty train, val and test datasets
df_train = pd.DataFrame()
df_val = pd.DataFrame()
df_test = pd.DataFrame()

subject_grouped_df_main = df_main.groupby([group_col], sort=False, as_index=False)
category_grouped_df_main = df_main.groupby(cat_col).count()[[group_col]]/len(df_main)*100

#Check inputs
assert (0 <= weight <= 1), "Weight must be between 0 and 1"
total_splits = round((train_pct) + float(test_pct) + float(val_pct),1)
assert (total_splits == 1), "Sum of train_pct, test_pct, and val_pct must equal 1."
assert (batch_size >= 1 and batch_size <= subject_grouped_df_main.ngroups / 10 ), \
"Batch must be greater than 1 and less than 1/10 count of groups"
subject_grouped_df_main = df_main.groupby(
[group_col], sort=False, as_index=False
)
category_grouped_df_main = (
df_main.groupby(cat_col).count()[[group_col]] / len(df_main) * 100
)

# Check inputs
assert 0 <= weight <= 1, "Weight must be between 0 and 1"
total_splits = round((train_pct) + float(test_pct) + float(val_pct), 1)
assert (
total_splits == 1
), "Sum of train_pct, test_pct, and val_pct must equal 1."
assert (
batch_size >= 1 and batch_size <= subject_grouped_df_main.ngroups / 10
), "Batch must be greater than 1 and less than 1/10 count of groups"

def calc_mse_loss(df):
grouped_df = df.groupby(cat_col).count()[[group_col]]/len(df)*100
df_temp = category_grouped_df_main.join(grouped_df, on = cat_col, how = 'left', lsuffix = '_main')
grouped_df = df.groupby(cat_col).count()[[group_col]] / len(df) * 100
df_temp = category_grouped_df_main.join(
grouped_df, on=cat_col, how="left", lsuffix="_main"
)
df_temp.fillna(0, inplace=True)
df_temp['diff'] = (df_temp['img_filename_main'] - df_temp[group_col])**2
mse_loss = np.mean(df_temp['diff'])
df_temp["diff"] = (df_temp["img_filename_main"] - df_temp[group_col]) ** 2
mse_loss = np.mean(df_temp["diff"])
return mse_loss

i = 0 #counter for all items in dataset
b = 0 #counter for the batches
i = 0 # counter for all items in dataset
b = 0 # counter for the batches
batch_df = df_main[0:0]

for _, group in subject_grouped_df_main:
if (i < 3):
if (i == 0):
df_train = df_train.append(pd.DataFrame(group), ignore_index=True)
if i < 3:
if i == 0:
df_train = pd.concat(
[df_train, pd.DataFrame(group)], ignore_index=True
)
i += 1
continue
elif (i == 1):
df_val = df_val.append(pd.DataFrame(group), ignore_index=True)
elif i == 1:
df_val = pd.concat([df_val, pd.DataFrame(group)], ignore_index=True)
i += 1
continue
else:
df_test = df_test.append(pd.DataFrame(group), ignore_index=True)
df_test = pd.concat(
[df_test, pd.DataFrame(group)], ignore_index=True
)
i += 1
continue

#Add groups to the
batch_df = batch_df.append(group)
# Add the group to the current batch
batch_df = pd.concat([batch_df, group])
b += 1
if b < batch_size and i < subject_grouped_df_main.ngroups-3:
if b < batch_size and i < subject_grouped_df_main.ngroups - 3:
i += 1
continue

mse_loss_diff_train = calc_mse_loss(df_train) - calc_mse_loss(df_train.append(batch_df, ignore_index=True))
mse_loss_diff_val = calc_mse_loss(df_val) - calc_mse_loss(df_val.append(batch_df, ignore_index=True))
mse_loss_diff_test = calc_mse_loss(df_test) - calc_mse_loss(df_test.append(batch_df, ignore_index=True))
mse_loss_diff_train = calc_mse_loss(df_train) - calc_mse_loss(
pd.concat([df_train, batch_df], ignore_index=True)
)
mse_loss_diff_val = calc_mse_loss(df_val) - calc_mse_loss(
pd.concat([df_val, batch_df], ignore_index=True)
)
mse_loss_diff_test = calc_mse_loss(df_test) - calc_mse_loss(
pd.concat([df_test, batch_df], ignore_index=True)
)

total_records = len(df_train) + len(df_val) + len(df_test)

len_diff_train = (train_pct - (len(df_train)/total_records))
len_diff_val = (val_pct - (len(df_val)/total_records))
len_diff_test = (test_pct - (len(df_test)/total_records))
len_diff_train = train_pct - (len(df_train) / total_records)
len_diff_val = val_pct - (len(df_val) / total_records)
len_diff_test = test_pct - (len(df_test) / total_records)

len_loss_diff_train = len_diff_train * abs(len_diff_train)
len_loss_diff_val = len_diff_val * abs(len_diff_val)
len_loss_diff_test = len_diff_test * abs(len_diff_test)

loss_train = (weight * mse_loss_diff_train) + ((1-weight) * len_loss_diff_train)
loss_val = (weight * mse_loss_diff_val) + ((1-weight) * len_loss_diff_val)
loss_test = (weight * mse_loss_diff_test) + ((1-weight) * len_loss_diff_test)

if (max(loss_train,loss_val,loss_test) == loss_train):
df_train = df_train.append(batch_df, ignore_index=True)
elif (max(loss_train,loss_val,loss_test) == loss_val):
df_val = df_val.append(batch_df, ignore_index=True)
loss_train = (weight * mse_loss_diff_train) + (
(1 - weight) * len_loss_diff_train
)
loss_val = (weight * mse_loss_diff_val) + ((1 - weight) * len_loss_diff_val)
loss_test = (weight * mse_loss_diff_test) + (
(1 - weight) * len_loss_diff_test
)

if max(loss_train, loss_val, loss_test) == loss_train:
df_train = pd.concat([df_train, batch_df], ignore_index=True)
elif max(loss_train, loss_val, loss_test) == loss_val:
df_val = pd.concat([df_val, batch_df], ignore_index=True)
else:
df_test = df_test.append(batch_df, ignore_index=True)
df_test = pd.concat([df_test, batch_df], ignore_index=True)

#print ("Group " + str(i) + ". loss_train: " + str(loss_train) + " | " + "loss_val: " + str(loss_val) + " | " + "loss_test: " + str(loss_test) + " | ")
# print ("Group " + str(i) + ". loss_train: " + str(loss_train) + " | " + "loss_val: " + str(loss_val) + " | " + "loss_test: " + str(loss_test) + " | ")
i += 1
#Reset the batch
# Reset the batch
b = 0
batch_df = df_main[0:0]


######
# Final prep tasks before returning the split dataframe

#Sometimes the algo will put some rows in the val set even if the split percent was set to zero
#In those cases move the rows from val to test
if round(val_pct,1) == round(0,1):
df_test.append(df_val)
df_val = df_val[0:0] #remove the values from
# Sometimes the algo will put some rows in the val set even if the split percent was set to zero
# In those cases move the rows from val to test
if round(val_pct, 1) == round(0, 1):
df_test = pd.concat([df_test, df_val])
df_val = df_val[0:0]  # remove the values from df_val

# Apply train, test, and val labels to the split column
df_train["split"] = "train"
df_test["split"] = "test"
df_val["split"] = "val"

#Apply train, split, val labels to the split collumn
df_train['split'] = 'train'
df_test['split'] = 'test'
df_val['split'] = 'val'
df = pd.concat([df_train, pd.concat([df_test, df_val])])

df = df_train.append(df_test).append(df_val)

assert df.shape == df_main.shape, "Output shape does not match input shape. Data loss has occured."
assert (
df.shape == df_main.shape
), "Output shape does not match input shape. Data loss has occured."

self.dataset.df = df
self.dataset.df = self.dataset.df.reset_index(drop=True)
self.dataset.df = self.dataset.df[schema]
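
For reference, a hedged usage sketch of the two splitters touched above. It assumes a COCO-format dataset and that the Dataset object exposes this class as dataset.splitter, as pylabel does elsewhere; adjust the paths and importer arguments to your own setup:

from pylabel import importer

# Hypothetical paths; substitute your own annotation file and image directory.
dataset = importer.ImportCoco("annotations/instances.json", path_to_images="images/")

# Grouped split: 50% train / 25% test / 25% val, grouped by image filename so
# every annotation from a given image lands in the same split.
dataset.splitter.GroupShuffleSplit(train_pct=0.5, test_pct=0.25, val_pct=0.25)

# Alternatively, a stratified split that also tries to keep class frequencies
# balanced across splits (val_pct=0.0 sends everything to train and test).
dataset.splitter.StratifiedGroupShuffleSplit(train_pct=0.7, test_pct=0.3, val_pct=0.0)

print(dataset.df["split"].value_counts())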
2 changes: 1 addition & 1 deletion samples
Submodule samples updated from 63b4f7 to a152a5
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@
setup(
name="pylabel",
packages=["pylabel"],
version="0.1.48",
version="0.1.49",
description="Transform, analyze, and visualize computer vision annotations.",
long_description=long_description,
long_description_content_type="text/markdown",
