whittle-org · aaronkl · Aug 22, 2024 · Aug 21, 2024
diff --git a/whittle/loss/kd_loss.py b/whittle/loss/kd_loss.py
@@ -50,6 +50,10 @@ def forward(self, outputs, labels, outputs_teacher):
             torch.Tensor: The combined loss.
         """
         soft_target_loss = 0
+        # The teacher output is only used as a target here, so detach it from
+        # the autograd graph. It may be None, in which case leave it as is.
+        if outputs_teacher is not None:
+            outputs_teacher = outputs_teacher.detach()
         if outputs_teacher is not None and self.distillation_weight > 0:
             soft_target_loss = self.kldiv(
                 F.log_softmax(outputs / self.temperature, dim=1),

diff --git a/whittle/training_strategies/ats.py b/whittle/training_strategies/ats.py
@@ -35,7 +35,7 @@ def __call__(self, model, inputs, outputs, **kwargs):
         super-network.
         """
         total_loss = 0
-        y_supernet = model(inputs).detach()
+        y_supernet = model(inputs)
         if self.current_step % 2 == 0:
             # update random sub-networks
             for i in range(self.random_samples):

diff --git a/whittle/training_strategies/base_strategy.py b/whittle/training_strategies/base_strategy.py
@@ -19,7 +19,7 @@ def __init__(
         self,
         sampler: RandomSampler,
         loss_function: Callable,
-        kd_loss: DistillLoss | None = None,
+        kd_loss: Callable | None = None,
         device: str = "cuda",
         **kwargs,
     ):
@@ -35,7 +35,7 @@ def __init__(
         self.loss_function = loss_function
         self.device = device
         self.kd_loss = kd_loss
-        if self.kd_loss is not None:
+        if isinstance(self.kd_loss, DistillLoss):
             if not isinstance(loss_function, torch.nn.CrossEntropyLoss):
                 raise TypeError(
                     "KD Loss not yet supported: Expected torch.nn.CrossEntropyLoss"

diff --git a/whittle/training_strategies/random.py b/whittle/training_strategies/random.py
@@ -10,7 +10,7 @@ class RandomStrategy(BaseTrainingStrategy):
     Randomly samples and updates `random_samples` sub-networks in each step.
     """
 
-    def __init__(self, random_samples=1, **kwargs):
+    def __init__(self, random_samples: int = 1, **kwargs):
         """
         Initialises a `RandomStrategy`
 
@@ -24,7 +24,7 @@ def __init__(self, random_samples=1, **kwargs):
     def __call__(self, model, inputs, outputs, **kwargs):
         """Updates randomly sampled sub-networks in each step."""
         total_loss = 0
-        y_supernet = model(inputs).detach()
+        y_supernet = model(inputs)
         for i in range(self.random_samples):
             config = self.sampler.sample()
             model.select_sub_network(config)

diff --git a/whittle/training_strategies/random_linear.py b/whittle/training_strategies/random_linear.py
@@ -40,7 +40,7 @@ def __init__(self, total_number_of_steps: int, random_samples: int = 1, **kwargs
 
     def __call__(self, model, inputs, outputs, **kwargs):
         total_loss = 0
-        y_supernet = model(inputs).detach()
+        y_supernet = model(inputs)
         if np.random.rand() <= self.rate[self.current_step]:
             # update random sub-networks
             for i in range(self.random_samples):

diff --git a/whittle/training_strategies/sandwich.py b/whittle/training_strategies/sandwich.py
@@ -17,7 +17,7 @@ class SandwichStrategy(BaseTrainingStrategy):
         https://arxiv.org/abs/1903.05134
     """
 
-    def __init__(self, random_samples=2, **kwargs):
+    def __init__(self, random_samples: int = 2, **kwargs):
         """
         Initialises a `SandwichStrategy`
 
@@ -42,7 +42,7 @@ def __call__(self, model, inputs, outputs, **kwargs):
             model.select_sub_network(config)
             y_hat = model(inputs)
             if self.kd_loss is not None:
-                loss = self.kd_loss(y_hat, outputs, y_supernet.detach())
+                loss = self.kd_loss(y_hat, outputs, y_supernet)
             else:
                 loss = self.loss_function(y_hat, outputs)
             loss.backward()
@@ -54,7 +54,7 @@ def __call__(self, model, inputs, outputs, **kwargs):
         model.select_sub_network(config)
         y_hat = model(inputs)
         if self.kd_loss is not None:
-            loss = self.kd_loss(y_hat, outputs, y_supernet.detach())
+            loss = self.kd_loss(y_hat, outputs, y_supernet)
         else:
             loss = self.loss_function(y_hat, outputs)
         loss.backward()