From 0846601931411969e2a37fd9a034b3ec09e9facf Mon Sep 17 00:00:00 2001
From: Tim Scherr
Date: Mon, 21 Mar 2022 11:07:59 +0100
Subject: [PATCH] adjust for fixed train-val split

---
 eval.py | 65 +++++++----------
 .../training/cell_segmentation_dataset.py | 72 +++++--------------
 segmentation/training/create_training_sets.py | 41 +++++------
 train.py | 45 ++++++------
 4 files changed, 84 insertions(+), 139 deletions(-)

diff --git a/eval.py b/eval.py
index 8321e9a..b790991 100644
--- a/eval.py
+++ b/eval.py
@@ -24,28 +24,24 @@ def main():
     # Get arguments
     parser = argparse.ArgumentParser(description='Conic Challenge - Evaluation')
     parser.add_argument('--model', '-m', required=True, type=str, help='Model to use')
-    parser.add_argument('--dataset', '-ds', default='conic_patches', type=str, help='"conic_patches" or "lizard"')
     parser.add_argument('--batch_size', '-bs', default=8, type=int, help='Batch size')
     parser.add_argument('--multi_gpu', '-mgpu', default=False, action='store_true', help='Use multiple GPUs')
     parser.add_argument('--save_raw_pred', '-srp', default=False, action='store_true', help='Save raw predictions')
     parser.add_argument('--th_cell', '-tc', default=0.07, nargs='+', help='Threshold for adjusting cell size')
     parser.add_argument('--th_seed', '-ts', default=0.45, nargs='+', help='Threshold for seeds')
     parser.add_argument('--tta', '-tta', default=False, action='store_true', help='Use test-time augmentation')
-    parser.add_argument('--eval_split', '-es', default=80, type=int, help='Train split in %')
     parser.add_argument('--upsample', '-u', default=False, action='store_true', help='Apply rescaling (1.25) for inference')
     parser.add_argument('--calc_perfect_class_metric', '-cpcm', default=False, action='store_true', help='Calculate metric for predicted segmentation and ground truth classification')
     args = parser.parse_args()
     # Paths
+    path_data = Path(__file__).parent / 'training_data' / 'conic_fixed_train_valid'
     path_models = Path(__file__).parent / 'models'
     if args.upsample:
-        path_train_data = Path(__file__).parent / 'training_data' / args.dataset / 'upsampled'
+        path_train_data = path_data / 'upsampled'
     else:
-        path_train_data = Path(__file__).parent / 'training_data' / args.dataset / 'original_scale'
-
-    if args.dataset == 'lizard':
-        raise NotImplementedError
+        path_train_data = path_data / 'original_scale'
     # Set device for using CPU or GPU
     device, num_gpus = torch.device("cuda" if torch.cuda.is_available() else "cpu"), 1
@@ -54,38 +50,38 @@ def main():
     if args.multi_gpu:
         num_gpus = torch.cuda.device_count()
-    # Check if data to evaluate exists
-    if not (path_train_data / 'images.npy').is_file() or not (path_train_data / 'labels.npy').is_file() \
-            or not (path_train_data / 'gts.npy').is_file():
+    # Check if training data (train_labels.npy, valid_labels.npy) already exists
+    if not (path_train_data / 'train_labels.npy').is_file() or not (path_train_data / 'valid_labels.npy').is_file():
         # Create training sets
         print(f'No training data found. Creating training data.\nUse upsampling: {args.upsample}')
-        if not (path_train_data.parent / 'images.npy').is_file():
-            raise Exception('images.npy not found in {}'.format(path_train_data.parent))
-        if not (path_train_data.parent / 'labels.npy').is_file():
-            raise Exception('labels.npy not found in {}'.format(path_train_data.parent))
+        if not (path_data / 'train_imgs.npy').is_file():
+            raise Exception('train_imgs.npy not found in {}'.format(path_data))
+        if not (path_data / 'train_anns.npy').is_file():
+            raise Exception('train_anns.npy not found in {}'.format(path_data))
+        if not (path_data / 'valid_imgs.npy').is_file():
+            raise Exception('valid_imgs.npy not found in {}'.format(path_data))
+        if not (path_data / 'valid_anns.npy').is_file():
+            raise Exception('valid_anns.npy not found in {}'.format(path_data))
         path_train_data.mkdir(exist_ok=True)
-        create_conic_training_sets(path_data=path_train_data.parent,
-                                   path_train_data=path_train_data,
-                                   upsample=args.upsample)
+        create_conic_training_sets(path_data=path_data, path_train_data=path_train_data, upsample=args.upsample,
+                                   mode='train')
+        create_conic_training_sets(path_data=path_data, path_train_data=path_train_data, upsample=args.upsample,
+                                   mode='valid')
     # Load model
     model = path_models / "{}.pth".format(args.model)
     # Directory for results
-    path_seg_results = path_train_data / f"{model.stem}_{args.eval_split}"
+    path_seg_results = path_train_data / f"{model.stem}"
     path_seg_results.mkdir(exist_ok=True)
     print(f"Evaluation of {model.stem}. Seed thresholds: {args.th_seed}, mask thresholds: {args.th_cell}, "
           f"upsampling: {args.upsample}, tta: {args.tta}")
     inference_args = deepcopy(args)
-    if args.dataset == "conic_patches":
-        dataset = ConicDataset(root_dir=path_train_data,
-                               mode="eval",
-                               transform=ToTensor(min_value=0, max_value=255),
-                               train_split=args.eval_split)
-    else:
-        raise NotImplementedError(f'Dataset {args.dataset} not implemented')
+    dataset = ConicDataset(root_dir=path_train_data,
+                           mode="eval",
+                           transform=ToTensor(min_value=0, max_value=255))
     inference_2d(model=model,
                  dataset=dataset,
@@ -121,20 +117,13 @@ def main():
         else:
             metrics_perfect_class = -1
-        # r2 metric
-        pred_counts = pd.read_csv(path_seg_results_th / "counts.csv")
-        gt_counts = dataset.counts
-        gt_counts = gt_counts.sort_index()
-        r2 = get_multi_r2(gt_counts, pred_counts)
-        print(f"  R2: {r2}")
-
-        result = pd.DataFrame([[args.model, args.dataset, args.upsample, th[0], th[1], metrics[0], metrics[1],
-                                metrics_perfect_class, r2, args.tta]],
-                              columns=["model_name", "dataset", "upsampling", "th_cell", "th_seed", "multi_pq+", "pq_metrics_avg",
-                                       "multi_pq+_perfect_class", "R2", "tta"])
+        result = pd.DataFrame([[args.model, args.upsample, th[0], th[1], metrics[0], metrics[1],
+                                metrics_perfect_class, args.tta]],
+                              columns=["model_name", "upsampling", "th_cell", "th_seed", "multi_pq+", "pq_metrics_avg",
+                                       "multi_pq+_perfect_class", "tta"])
-        result.to_csv(Path(__file__).parent / f"scores{args.eval_split}.csv",
-                      header=not (Path(__file__).parent / f"scores{args.eval_split}.csv").exists(),
+        result.to_csv(Path(__file__).parent / "scores_post-challenge-analysis.csv",
+                      header=not (Path(__file__).parent / "scores_post-challenge-analysis.csv").exists(),
                       index=False, mode="a")

diff --git a/segmentation/training/cell_segmentation_dataset.py b/segmentation/training/cell_segmentation_dataset.py
index b2085c9..80f2f2a 100644
--- a/segmentation/training/cell_segmentation_dataset.py
+++ b/segmentation/training/cell_segmentation_dataset.py
@@ -1,5 +1,4 @@
 import numpy as np
-import pandas as pd
 from torch.utils.data import Dataset
@@ -7,7 +6,7 @@
 class ConicDataset(Dataset):
     """ Pytorch data set for CoNIC Challenge """
-    def __init__(self, root_dir, mode='train', transform=lambda x: x, train_split=80):
+    def __init__(self, root_dir, mode, transform=lambda x: x):
         """
         :param root_dir: Directory containing the dataset.
@@ -16,30 +15,29 @@ def __init__(self, root_dir, mode='train', transform=lambda x: x, train_split=80
         :type mode: str
         :param transform: transforms.
         :type transform:
-        :param train_split: percent of the data used for training
-        :type train_split: int
         :return: Dict (image, cell_label, border_label, id).
         """
-
-        imgs = np.load(root_dir/"images.npy")
-        if mode in ['train', 'val']:
-            labels = np.load(root_dir / "labels.npy")
-            assert imgs.shape[0] == labels.shape[0], "Missmatch between images.npy and labels_train.npy"
-            counts = pd.read_csv(root_dir / "counts.csv")
+        if mode == 'train':
+            self.imgs = np.load(root_dir / "train_images.npy")
+            self.labels = np.load(root_dir / "train_labels.npy")
+            # Shuffle the training samples
+            ids = np.arange(len(self.imgs))
+            np.random.shuffle(ids)
+            self.imgs = self.imgs[ids]
+            self.labels = self.labels[ids]
+            assert self.imgs.shape[0] == self.labels.shape[0], "Mismatch between train_images.npy and train_labels.npy"
+        elif mode == 'val':
+            self.imgs = np.load(root_dir / "valid_images.npy")
+            self.labels = np.load(root_dir / "valid_labels.npy")
+            assert self.imgs.shape[0] == self.labels.shape[0], "Mismatch between valid_images.npy and valid_labels.npy"
         elif mode == 'eval':
-            labels = np.load(root_dir / "gts.npy").astype(np.int64)  # pytorchs default_colate cannot handle uint16
-            counts = pd.read_csv(root_dir / "counts.csv")
+            self.imgs = np.load(root_dir / "valid_images.npy")
+            self.labels = np.load(root_dir / "valid_gts.npy").astype(np.int64)  # PyTorch's default_collate cannot handle uint16
         self.root_dir = root_dir
         self.mode = mode
-        self.train_split = train_split
-        self.ids = self.extract_train_val_ids(imgs.shape[0], 0)
-        self.imgs = imgs[self.ids, ...]
-        self.len = len(self.ids)
-        if mode in ['train', 'val', 'eval']:
-            self.labels = labels[self.ids, ...]
-            self.counts = self.get_counts(counts=counts)
+        self.len = len(self.imgs)
         self.transform = transform
     def __len__(self):
@@ -48,40 +46,6 @@ def __len__(self):
     def __getitem__(self, idx):
         sample = {'image': np.copy(self.imgs[idx, ...]),
                   'label': np.copy(self.labels[idx, ...]),
-                  'id': self.ids[idx]}
+                  'id': idx}
         sample = self.transform(sample)
         return sample
-
-    def extract_train_val_ids(self, n_imgs, seed):
-        """
-
-        :param n_imgs:
-        :param seed:
-        :return:
-        """
-        np.random.seed(seed)  # seed numpy to always get the same images for the same seed
-        ids = np.arange(n_imgs)
-        np.random.shuffle(ids)  # shuffle inplace
-        if self.mode == "train":
-            ids = ids[0:int(np.round(len(ids)*self.train_split/100))]
-        elif self.mode in ["val", "eval"]:
-            ids = ids[int(np.round(len(ids)*self.train_split/100)):]
-        else:  # use all ids
-            pass
-        return ids
-
-    def get_counts(self, counts):
-        """
-
-        :param counts:
-        :type counts: pandas DataFrame
-        :return: sorted nuclear composition DataFrame
-        """
-        total_counts = counts.iloc[self.ids].sum(axis=0)
-        total_counts.name = "counts"
-        total_counts.to_csv(self.root_dir / f"total_counts_{self.mode}_{self.train_split}.csv", index=False)
-        counts = counts.iloc[self.ids]
-        counts = counts.sort_index()
-        counts.to_csv(self.root_dir / f"counts_{self.mode}_{self.train_split}.csv", index=False)
-
-        return counts
diff --git a/segmentation/training/create_training_sets.py b/segmentation/training/create_training_sets.py
index 987b405..baf3652 100644
--- a/segmentation/training/create_training_sets.py
+++ b/segmentation/training/create_training_sets.py
@@ -7,7 +7,7 @@ from segmentation.training.train_data_representations import distance_label
-def create_conic_training_sets(path_data, path_train_data, upsample):
+def create_conic_training_sets(path_data, path_train_data, upsample, mode):
     """ Create training sets for CoNIC Challenge data.
     :param path_data: Path to the directory containing the CoNIC Challenge data / training data.
     :type path_data: Pathlib Path object.
     :param path_train_data: Path to the directory to save the training data into.
     :type path_data: Pathlib Path object.
     :param upsample: Apply upsampling (factor 1.25).
     :type upsample: bool
+    :param mode: 'train' or 'valid'
+    :type mode: str
     :return: None
     """
-    imgs = np.load(path_data / "images.npy")
-    gts = np.load(path_data / "labels.npy")
-    counts = pd.read_csv(path_data / "counts.csv")
+    print(f"Create data for mode {mode}.")
-    print("0.1/99.9 percentile channel 0: {}".format(np.percentile(imgs[..., 0], (0.1, 99.9))))
-    print("0.1/99.9 percentile channel 1: {}".format(np.percentile(imgs[..., 1], (0.1, 99.9))))
-    print("0.1/99.9 percentile channel 2: {}".format(np.percentile(imgs[..., 2], (0.1, 99.9))))
+    imgs = np.load(path_data / f"{mode}_imgs.npy")
+    gts = np.load(path_data / f"{mode}_anns.npy")
     if upsample:  # results for conic patches in 320-by-320 patches
         scale = 1.25
@@ -72,23 +71,21 @@
     imgs = np.delete(imgs, np.array(slice_ids), axis=0)
     labels_train = np.delete(labels_train, np.array(slice_ids), axis=0)
     gts = np.delete(gts, np.array(slice_ids), axis=0)
-    counts = counts.drop(slice_ids)
-    np.save(path_train_data / "images.npy", imgs)
-    np.save(path_train_data / "labels.npy", labels_train)
-    np.save(path_train_data / "gts.npy", gts)
-    counts.to_csv(path_train_data / "counts.csv", index=False)
+    np.save(path_train_data / f"{mode}_images.npy", imgs)
+    np.save(path_train_data / f"{mode}_labels.npy", labels_train)
+    np.save(path_train_data / f"{mode}_gts.npy", gts)
     # save tiffs for imagej visualization
-    tifffile.imsave(path_train_data / "labels_channel_0.tiff", labels_train[..., 0])
-    tifffile.imsave(path_train_data / "labels_channel_1.tiff", labels_train[..., 1])
-    tifffile.imsave(path_train_data / "labels_channel_2.tiff", labels_train[..., 2])
-    tifffile.imsave(path_train_data / "labels_channel_3.tiff", labels_train[..., 3])
-    tifffile.imsave(path_train_data / "labels_channel_4.tiff", labels_train[..., 4])
-    tifffile.imsave(path_train_data / "labels_channel_5.tiff", labels_train[..., 5])
-    tifffile.imsave(path_train_data / "labels_channel_6.tiff", labels_train[..., 6])
-    tifffile.imsave(path_train_data / "gts_instance.tiff", gts[..., 0])
-    tifffile.imsave(path_train_data / "gts_class.tiff", gts[..., 1])
-    tifffile.imsave(path_train_data / "images.tiff", imgs)
+    tifffile.imsave(path_train_data / f"{mode}_labels_channel_0.tiff", labels_train[..., 0])
+    tifffile.imsave(path_train_data / f"{mode}_labels_channel_1.tiff", labels_train[..., 1])
+    tifffile.imsave(path_train_data / f"{mode}_labels_channel_2.tiff", labels_train[..., 2])
+    tifffile.imsave(path_train_data / f"{mode}_labels_channel_3.tiff", labels_train[..., 3])
+    tifffile.imsave(path_train_data / f"{mode}_labels_channel_4.tiff", labels_train[..., 4])
+    tifffile.imsave(path_train_data / f"{mode}_labels_channel_5.tiff", labels_train[..., 5])
+    tifffile.imsave(path_train_data / f"{mode}_labels_channel_6.tiff", labels_train[..., 6])
+    tifffile.imsave(path_train_data / f"{mode}_gts_instance.tiff", gts[..., 0])
+    tifffile.imsave(path_train_data / f"{mode}_gts_class.tiff", gts[..., 1])
+    tifffile.imsave(path_train_data / f"{mode}_images.tiff", imgs)
     return None
diff --git a/train.py b/train.py
index c876fa4..71175f8 100644
--- a/train.py
+++ b/train.py
@@ -24,7 +24,6 @@ def main():
     parser = argparse.ArgumentParser(description='Conic Challenge - Training')
     parser.add_argument('--model_name', '-m', default='conic_model', type=str, help='Building block for the unique model name. Best use a suffix, e.g., "conic_model_mb')
-    parser.add_argument('--dataset', '-ds', default='conic_patches', type=str, help='"conic_patches" or "lizard"')
     parser.add_argument('--act_fun', '-af', default='relu', type=str, help='Activation function')
     parser.add_argument('--batch_size', '-bs', default=8, type=int, help='Batch size')
     parser.add_argument('--classes', '-c', default=6, type=int, help='Classes to predict')
@@ -34,7 +33,6 @@ def main():
     parser.add_argument('--norm_method', '-nm', default='bn', type=str, help='Normalization method')
     parser.add_argument('--optimizer', '-o', default='adam', type=str, help='Optimizer')
     parser.add_argument('--pool_method', '-pm', default='conv', type=str, help='Pool method')
-    parser.add_argument('--train_split', '-ts', default=80, type=int, help='Train split in %')
     parser.add_argument('--upsample', '-u', default=False, action='store_true', help='Apply rescaling (1.25)')
     parser.add_argument('--channels_in', '-cin', default=3, type=int, help="Number of input channels")
     parser.add_argument('--max_epochs', '-me', default=None, type=int, help='Maximum number of epochs (None: auto defined)')
@@ -58,16 +56,13 @@ def main():
                         f"({len(args.weightmap_weights)})")
     # Paths
-    path_data = Path(__file__).parent / 'training_data' / args.dataset
+    path_data = Path(__file__).parent / 'training_data' / 'conic_fixed_train_valid'
     path_models = Path(__file__).parent / 'models'
     if args.upsample:
         path_train_data = path_data / 'upsampled'
     else:
         path_train_data = path_data / 'original_scale'
-    if args.dataset == 'lizard':
-        raise NotImplementedError
-
     # Set device for using CPU or GPU
     device, num_gpus = torch.device("cuda" if torch.cuda.is_available() else "cpu"), 1
     if str(device) == 'cuda':
@@ -76,19 +71,24 @@ def main():
         num_gpus = torch.cuda.device_count()
     # Check if training data (labels_train.npy) already exist
-    if not (path_train_data / 'images.npy').is_file() or not (path_train_data / 'labels.npy').is_file() \
-            or not (path_train_data / 'gts.npy').is_file():
+    if not (path_train_data / 'train_labels.npy').is_file() or not (path_train_data / 'valid_labels.npy').is_file():
         # Create training sets
         print(f'No training data found. Creating training data.\nUse upsampling: {args.upsample}')
-        if not (path_data / 'images.npy').is_file():
-            raise Exception('images.npy not found in {}'.format(path_data))
-        if not (path_data / 'labels.npy').is_file():
-            raise Exception('labels.npy not found in {}'.format(path_data))
+        if not (path_data / 'train_imgs.npy').is_file():
+            raise Exception('train_imgs.npy not found in {}'.format(path_data))
+        if not (path_data / 'train_anns.npy').is_file():
+            raise Exception('train_anns.npy not found in {}'.format(path_data))
+        if not (path_data / 'valid_imgs.npy').is_file():
+            raise Exception('valid_imgs.npy not found in {}'.format(path_data))
+        if not (path_data / 'valid_anns.npy').is_file():
+            raise Exception('valid_anns.npy not found in {}'.format(path_data))
         path_train_data.mkdir(exist_ok=True)
-        create_conic_training_sets(path_data=path_data, path_train_data=path_train_data, upsample=args.upsample)
+        create_conic_training_sets(path_data=path_data, path_train_data=path_train_data, upsample=args.upsample,
+                                   mode='train')
+        create_conic_training_sets(path_data=path_data, path_train_data=path_train_data, upsample=args.upsample,
+                                   mode='valid')
     # Train model
-
     run_name = utils.unique_path(path_models, args.model_name + '_{:02d}.pth').stem
     # Get CNN (double encoder U-Net)
@@ -105,7 +105,6 @@
                      'optimizer': args.optimizer,
                      'run_name': run_name,
                      'max_epochs': args.max_epochs,
-                     'train_split': args.train_split,
                      'upsample': args.upsample,
                      'loss_fraction_weights': args.loss_fraction_weights,
                      'weightmap_weights': args.weightmap_weights
@@ -123,14 +122,10 @@ def main():
     # Load training and validation set
     data_transforms = augmentors(min_value=0, max_value=255)
     train_configs['data_transforms'] = str(data_transforms)
-    if args.dataset == "conic_patches":
-        datasets = {x: ConicDataset(root_dir=path_train_data,
-                                    mode=x,
-                                    transform=data_transforms[x],
-                                    train_split=args.train_split)
-                    for x in ['train', 'val']}
-    else:
-        raise NotImplementedError(f'Dataset {args.dataset} not implemented')
+    datasets = {x: ConicDataset(root_dir=path_train_data,
+                                mode=x,
+                                transform=data_transforms[x])
+                for x in ['train', 'val']}
     if not train_configs['max_epochs']:  # Get number of training epochs depending on dataset size if not given
         train_configs['max_epochs'] = get_max_epochs(len(datasets['train']) + len(datasets['val']))
@@ -138,8 +133,8 @@ def main():
     # Train model
     best_loss = train(net=net, datasets=datasets, configs=train_configs, device=device, path_models=path_models)
-    # Fine-tune with cosine annealing for Ranger models
-    # Does not help in most cases (also our submitted model is without cosine annealing)
+    # Fine-tune with cosine annealing for Ranger models (does not really help; the challenge model was trained
+    # without cosine annealing)
     # if train_configs['optimizer'] == 'ranger':
     #     net = build_unet(act_fun=train_configs['architecture'][2],
     #                      pool_method=train_configs['architecture'][1],