From b9029c5c8a5162ce4ee5db59d3247c075c383f97 Mon Sep 17 00:00:00 2001
From: mohsenal-zeqri <mmalzzaqri15@eng.just.edu.jo>
Date: Sat, 16 Nov 2024 15:31:25 +0100
Subject: [PATCH 01/11] fix: addition and standardization of docstrings

---
 CONTRIBUTING.md                      | 14 +++++++++++
 whittle/models/gpt/extract.py        | 18 +++++++++++++-
 whittle/models/gpt/utils.py          |  3 ++-
 whittle/modules/embedding.py         |  4 ++++
 whittle/modules/layernorm.py         |  4 ++++
 whittle/modules/linear.py            |  5 +++-
 whittle/sampling/random_sampler.py   | 35 +++++++++++++++++++++++++---
 whittle/search/ask_tell_scheduler.py | 15 ++++++++----
 whittle/search/local_search.py       | 28 +++++++++++-----------
 whittle/search/multi_objective.py    | 11 ++++++---
 whittle/search/search.py             | 30 +++++++++++++++---------
 11 files changed, 128 insertions(+), 39 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index e9afa97a..8e55017c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -21,6 +21,20 @@ pre-commit install
 pytest
 ```
 
+## Docstring Writing Guidelines
+
+When adding or updating functions or classes, please ensure that each has a docstring that follows this format:
+
+- **Summary**: A brief description of what the function or class does.
+- **Args**: List each argument with its name and a short description of its purpose.
+- **Returns**: Describe the return value, including what it represents.
+**Note**: After adding or updating the docstring, ensure that the code passes the following command with **no warnings**:
+
+```bash
+mkdocs build --clean --strict
+```
+
+
 ## Conventional commits and Commitizen
 
 We use [commitizen](https://commitizen-tools.github.io/commitizen/) to manage commits.
diff --git a/whittle/models/gpt/extract.py b/whittle/models/gpt/extract.py
index 04d37407..b99405f8 100644
--- a/whittle/models/gpt/extract.py
+++ b/whittle/models/gpt/extract.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+
 import torch.nn as nn
 
 from collections import OrderedDict
@@ -8,9 +9,24 @@
 from whittle.models.gpt.blocks.mlp import GptNeoxMLP, LLaMAMLP
 from whittle.modules.layernorm import LayerNorm
 from whittle.modules.rmsnorm import RMSNorm
+from litgpt import Config
+
+
+def extract_sub_network(model: GPT, sub_network_config: Config) -> GPT:
+    """
+    Extracts a sub-network from a given model based on the specified sub-network configuration.
+    Copies relevant layers, weights, and configurations from the full model into a sub-network model.
+
+    Args:
+        model: The original, full GPT model from which the sub-network is extracted.
+        sub_network_config: Configuration object for the sub-network, containing the necessary
+                                     architecture specifications such as embedding size, number of heads,
+                                     and number of layers.
 
+    Returns:
+        GPT: A new sub-network model instance, initialized with parameters extracted from the original model.
+    """
 
-def extract_sub_network(model, sub_network_config):
     sub_network = GPT(sub_network_config)
 
     state_dict = extract_linear(model.lm_head)
diff --git a/whittle/models/gpt/utils.py b/whittle/models/gpt/utils.py
index ef3ddde2..2cf2ea86 100644
--- a/whittle/models/gpt/utils.py
+++ b/whittle/models/gpt/utils.py
@@ -324,7 +324,8 @@ def map_old_state_dict_weights(state_dict: dict, mapping: Mapping, prefix: str)
 
 
 def get_default_supported_precision(training: bool) -> str:
-    """Return default precision that is supported by the hardware: either `bf16` or `16`.
+    """
+    Return default precision that is supported by the hardware: either `bf16` or `16`.
 
     Args:
         training: `-mixed` or `-true` version of the precision to use
diff --git a/whittle/modules/embedding.py b/whittle/modules/embedding.py
index 94045336..abe5ab45 100644
--- a/whittle/modules/embedding.py
+++ b/whittle/modules/embedding.py
@@ -6,6 +6,8 @@
 
 
 class Embedding(torch.nn.Embedding):
+    """An extension of PyTorch's `torch.nn.Embedding` with support of sub-network dimensionality."""
+
     def __init__(
         self,
         num_embeddings: int,
@@ -34,9 +36,11 @@ def __init__(
         self.sub_network_embedding_dim: int | None = embedding_dim
 
     def set_sub_network(self, sub_network_embedding_dim: int):
+        """Set the embedding dimensionality of the current sub-network."""
         self.sub_network_embedding_dim = sub_network_embedding_dim
 
     def reset_super_network(self):
+        """Reset the embedding dimensionality of the current sub-network to the original value."""
         self.sub_network_embedding_dim = self.embedding_dim
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
diff --git a/whittle/modules/layernorm.py b/whittle/modules/layernorm.py
index cc1bc7d3..0c5c1766 100644
--- a/whittle/modules/layernorm.py
+++ b/whittle/modules/layernorm.py
@@ -5,6 +5,8 @@
 
 
 class LayerNorm(torch.nn.LayerNorm):
+    """An extension of PyTorch's `torch.nn.LayerNorm` with support of sub-network dimensionality."""
+
     def __init__(self, in_features: int, eps: float = 1e-5):
         super().__init__(in_features, eps)
         self.in_features = in_features
@@ -13,9 +15,11 @@ def __init__(self, in_features: int, eps: float = 1e-5):
         self.sub_network_in_features = self.in_features
 
     def set_sub_network(self, sub_network_in_features: int):
+        """Set the input dimensionality of the current sub-network."""
         self.sub_network_in_features = sub_network_in_features
 
     def reset_super_network(self):
+        """Reset the input dimensionality of the current sub-network to the original value."""
         self.sub_network_in_features = self.in_features
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
diff --git a/whittle/modules/linear.py b/whittle/modules/linear.py
index 5bbb5c74..e36f5594 100644
--- a/whittle/modules/linear.py
+++ b/whittle/modules/linear.py
@@ -6,6 +6,8 @@
 
 
 class Linear(nn.Linear):
+    """An extension of PyTorch's `torch.nn.Linear` with support of sub-network dimensionality."""
+
     def __init__(
         self,
         in_features: int,
@@ -14,7 +16,6 @@ def __init__(
         device=None,
         dtype=None,
     ):
-        """ """
         super().__init__(in_features, out_features, bias, device, dtype)
 
         # Set the current sub-network dimensions equal to super-network
@@ -25,10 +26,12 @@ def __init__(
     def set_sub_network(
         self, sub_network_in_features: int, sub_network_out_features: int
     ):
+        """Set the linear transformation dimensions of the current sub-network."""
         self.sub_network_in_features = sub_network_in_features
         self.sub_network_out_features = sub_network_out_features
 
     def reset_super_network(self):
+        """Reset the linear transformation dimensions of the current sub-network to the original value."""
         self.sub_network_in_features = self.in_features
         self.sub_network_out_features = self.out_features
 
diff --git a/whittle/sampling/random_sampler.py b/whittle/sampling/random_sampler.py
index c569493e..1fd051cc 100644
--- a/whittle/sampling/random_sampler.py
+++ b/whittle/sampling/random_sampler.py
@@ -1,23 +1,45 @@
 from __future__ import annotations
 import warnings
 
+from typing import Any
+
 import numpy as np
 from syne_tune.config_space import Categorical, Domain
 
 
 class RandomSampler:
+    """
+    RandomSampler samples configurations from a given configuration space using a random state.
+
+    Args:
+        config_space: The configuration space from which to sample.
+        seed: Seed for the random number generator. Defaults to None.
+    """
+
     def __init__(self, config_space: dict, seed: int | None = None):
         self.config_space = config_space
         self.rng = np.random.RandomState(seed)
 
-    def sample(self):
+    def sample(self) -> dict[str, Any]:
+        """
+        Gets the smallest sub-network configuration from the configuration space.
+
+        Returns:
+            dict: The smallest sub-network configuration.
+        """
         config = {}
         for hp_name, hparam in self.config_space.items():
             if isinstance(hparam, Domain):
                 config[hp_name] = hparam.sample(random_state=self.rng)
         return config
 
-    def get_smallest_sub_network(self):
+    def get_smallest_sub_network(self) -> dict[str, Any]:
+        """
+        Gets the smallest sub-network configuration from the configuration space.
+
+        Returns:
+            dict: The smallest sub-network configuration.
+        """
         config = {}
         for k, v in self.config_space.items():
             if isinstance(v, Domain):
@@ -33,7 +55,14 @@ def get_smallest_sub_network(self):
                     config[k] = v.lower
         return config
 
-    def get_largest_sub_network(self):
+    def get_largest_sub_network(self) -> dict[str, Any]:
+        """
+        gets the largest sub-network configuration from the configuration space.
+
+        Returns:
+            dict: The largest sub-network configuration.
+        """
+
         config = {}
         for k, v in self.config_space.items():
             if isinstance(v, Domain):
diff --git a/whittle/search/ask_tell_scheduler.py b/whittle/search/ask_tell_scheduler.py
index 64777b70..848dc8b5 100644
--- a/whittle/search/ask_tell_scheduler.py
+++ b/whittle/search/ask_tell_scheduler.py
@@ -19,7 +19,9 @@ def __init__(self, base_scheduler: TrialScheduler):
     def ask(self) -> Trial:
         """
         Ask the scheduler for new trial to run
-        :return: Trial to run
+
+        Returns:
+            Trial to run
         """
         trial_suggestion = self.bscheduler.suggest(self.trial_counter)
         trial = Trial(
@@ -32,10 +34,12 @@ def ask(self) -> Trial:
 
     def tell(self, trial: Trial, experiment_result: dict[str, float]):
         """
-        Feed experiment results back to the Scheduler
+        Feed experiment results back to the Scheduler.
+
+        Args:
+            trial: Trial that was run.
+            experiment_result: {metric: value} dictionary with experiment results.
 
-        :param trial: Trial that was run
-        :param experiment_result: {metric: value} dictionary with experiment results
         """
         trial_result = trial.add_results(
             metrics=experiment_result,
@@ -47,7 +51,8 @@ def tell(self, trial: Trial, experiment_result: dict[str, float]):
 
     def best_trial(self, metris: str) -> TrialResult:
         """
-        Return the best trial according to the provided metric
+        Returns:
+            the best trial according to the provided metric.
         """
         if self.bscheduler.mode == "max":
             sign = 1.0
diff --git a/whittle/search/local_search.py b/whittle/search/local_search.py
index a3e479f5..a82d5acc 100644
--- a/whittle/search/local_search.py
+++ b/whittle/search/local_search.py
@@ -27,21 +27,21 @@ class PopulationElement:
 
 class LS(FIFOScheduler):
     """
-
     See :class:`~syne_tune.optimizer.schedulers.searchers.RandomSearcher`
-    for ``kwargs["search_options"]`` parameters.
-
-    :param config_space: Configuration space for evaluation function
-    :param metric: Name of metric to optimize
-    :param population_size: See
-        :class:`~syne_tune.optimizer.schedulers.searchers.RegularizedEvolution`.
-        Defaults to 100
-    :param sample_size: See
-        :class:`~syne_tune.optimizer.schedulers.searchers.RegularizedEvolution`.
-        Defaults to 10
-    :param random_seed: Random seed, optional
-    :param kwargs: Additional arguments to
-        :class:`~syne_tune.optimizer.schedulers.FIFOScheduler`
+    for ``kwargs["search_options"]`` parameters.
+
+    Args:
+        config_space: Configuration space for evaluation function
+        metric: Name of metric to optimize
+        population_size: See
+            :class:`~syne_tune.optimizer.schedulers.searchers.RegularizedEvolution`.
+            Defaults to 100
+        sample_size: See
+            :class:`~syne_tune.optimizer.schedulers.searchers.RegularizedEvolution`.
+            Defaults to 10
+        random_seed: Random seed, optional.
+        kwargs: Additional arguments to
+            :class:`~syne_tune.optimizer.schedulers.FIFOScheduler`
     """
 
     def __init__(
diff --git a/whittle/search/multi_objective.py b/whittle/search/multi_objective.py
index bae442a3..a5eac5e7 100644
--- a/whittle/search/multi_objective.py
+++ b/whittle/search/multi_objective.py
@@ -5,10 +5,15 @@
 
 
 def get_pareto_optimal(costs: np.ndarray) -> npt.NDArray[np.bool_]:
-    """Find the pareto-optimal point.
+    """
+    Find the pareto-optimal point.
+
+    Args:
+        costs: (n_points, m_cost_values) array
+
+    Returns:
+        (n_points, 1) indicator if point is on pareto front or not.
 
-    :param costs: (n_points, m_cost_values) array
-    :return: (n_points, 1) indicator if point is on pareto front or not.
     """
     assert isinstance(costs, np.ndarray)
     assert costs.ndim == 2
diff --git a/whittle/search/search.py b/whittle/search/search.py
index e9fe095d..1d8df89c 100644
--- a/whittle/search/search.py
+++ b/whittle/search/search.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import time
+from typing import Any
 
 import numpy as np
 
@@ -16,19 +17,26 @@ def multi_objective_search(
     num_samples: int = 100,
     objective_kwargs: dict | None = None,
     seed: int | None = None,
-):
-    """
-    Search for the Pareto optimal sub-networks.
-
-    :param objective: the objective function to optimize.
-    :param search_space: the search space.
-    :param search_strategy: the search strategy.
-    :param objective_kwargs: the keyword arguments for the objective function.
-    :param num_samples: the number of samples to take.
-    :param seed: the random seed.
-    :return: the results of the search.
+) -> dict[str, Any]:
     """
+    Search for the Pareto-optimal sub-networks using the specified strategy.
+
+    Args:
+        objective (callable): The objective function to optimize.
+        search_space: The search space for the optimization.
+        search_strategy: The search strategy to use.
+            Defaults to "random_search".
+        num_samples: The number of samples to evaluate.
+            Defaults to 100.
+        objective_kwargs: Keyword arguments for the objective function.
+            Defaults to None.
+        seed: The random seed for reproducibility.
+            Defaults to None.
 
+    Returns:
+        dict: The results of the search, including Pareto-optimal solutions.
+
+    """
     metrics = ["objective_1", "objective_2"]
     if seed is None:
         seed = np.random.randint(0, 1000000)

From ff4144e755c18e386b3b19874a14eb192f6108c2 Mon Sep 17 00:00:00 2001
From: mohsenal-zeqri <mmalzzaqri15@eng.just.edu.jo>
Date: Sun, 17 Nov 2024 17:36:08 +0100
Subject: [PATCH 02/11] enhance: modified and added more docstrings

---
 whittle/models/gpt/blocks/causal_self_attention.py |  4 ++++
 whittle/models/gpt/blocks/mlp.py                   | 10 ++++++++++
 whittle/modules/embedding.py                       |  4 ++--
 whittle/modules/layernorm.py                       |  4 ++--
 whittle/modules/linear.py                          |  4 ++--
 whittle/sampling/random_sampler.py                 | 10 +++++-----
 6 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/whittle/models/gpt/blocks/causal_self_attention.py b/whittle/models/gpt/blocks/causal_self_attention.py
index 07420ec8..272201eb 100644
--- a/whittle/models/gpt/blocks/causal_self_attention.py
+++ b/whittle/models/gpt/blocks/causal_self_attention.py
@@ -11,6 +11,8 @@
 
 
 class CausalSelfAttention(nn.Module):
+    """Extension of litgpt's `litgpt.model.CausalSelfAttention` with support to adapt to sub-network dimensionality."""
+
     def __init__(self, config: Config, block_idx: int) -> None:
         super().__init__()
         shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
@@ -48,6 +50,7 @@ def set_sub_network(
         sub_network_query_groups: int,
         sub_network_head_size: int,
     ):
+        """Sets the CausalSelfAttention block to the specified sub-network dimensionality."""
         self.sub_network_n_embd = sub_network_n_embd
         self.sub_network_n_head = sub_network_n_head
         self.sub_network_query_groups = sub_network_query_groups
@@ -73,6 +76,7 @@ def set_sub_network(
             self.sub_attention_scaler = self.config.attention_scores_scalar
 
     def reset_super_network(self):
+        """Resets the dimensionality of the current to the super-network dimensionality."""
         self.sub_network_n_embd = self.config.n_embd
         self.sub_network_n_head = self.config.n_head
         self.sub_network_head_size = self.config.head_size
diff --git a/whittle/models/gpt/blocks/mlp.py b/whittle/models/gpt/blocks/mlp.py
index c87b9744..0618d215 100644
--- a/whittle/models/gpt/blocks/mlp.py
+++ b/whittle/models/gpt/blocks/mlp.py
@@ -9,6 +9,8 @@
 
 
 class GptNeoxMLP(litgpt.model.GptNeoxMLP):
+    """An extension of litgpt's `litgpt.model.GptNeoxMLP` with support to adapt to sub-network dimensionality."""
+
     def __init__(self, config: Config) -> None:
         super().__init__(config)
         self.fc = Linear(config.n_embd, config.intermediate_size, bias=config.bias)
@@ -24,6 +26,7 @@ def __init__(self, config: Config) -> None:
     def set_sub_network(
         self, sub_network_n_embd: int, sub_network_intermediate_size: int
     ):
+        """Set the input dimensionality of the current sub-network."""
         self.sub_network_n_embd = sub_network_n_embd
         self.sub_network_intermediate_size = sub_network_intermediate_size
 
@@ -35,6 +38,7 @@ def set_sub_network(
         )
 
     def reset_super_network(self):
+        """Reset the input dimensionality of the current sub-network to the super-network dimensionality."""
         self.sub_network_n_embd = self.in_features
         self.sub_network_intermediate_size = self.intermediate_size
 
@@ -43,6 +47,8 @@ def reset_super_network(self):
 
 
 class LLaMAMLP(litgpt.model.LLaMAMLP):
+    """An extension of litgpt's `litgpt.model.LLaMAMLP` with support to adapt to sub-network dimensionality."""
+
     def __init__(self, config: Config) -> None:
         super().__init__(config)
         self.fc_1 = Linear(config.n_embd, config.intermediate_size, bias=config.bias)
@@ -57,6 +63,7 @@ def __init__(self, config: Config) -> None:
     def set_sub_network(
         self, sub_network_n_embd: int, sub_network_intermediate_size: int
     ):
+        """Set the input dimensionality of the current sub-network."""
         self.sub_network_n_embd = sub_network_n_embd
         self.sub_network_intermediate_size = sub_network_intermediate_size
 
@@ -71,6 +78,7 @@ def set_sub_network(
         )
 
     def reset_super_network(self):
+        """Reset the input dimensionality of the current sub-network to the super-network dimensionality."""
         self.sub_network_n_embd = self.in_features
         self.sub_network_intermediate_size = self.intermediate_size
 
@@ -80,6 +88,8 @@ def reset_super_network(self):
 
 
 class GemmaMLP(LLaMAMLP):
+    """ "Implementation of the forward pass of LLaMAMLP network."""
+
     def __init__(self, config: Config) -> None:
         super().__init__(config)
 
diff --git a/whittle/modules/embedding.py b/whittle/modules/embedding.py
index abe5ab45..771ecbee 100644
--- a/whittle/modules/embedding.py
+++ b/whittle/modules/embedding.py
@@ -6,7 +6,7 @@
 
 
 class Embedding(torch.nn.Embedding):
-    """An extension of PyTorch's `torch.nn.Embedding` with support of sub-network dimensionality."""
+    """An extension of PyTorch's `torch.nn.Embedding` with support to sub-sample weights corresponding to the sub-network dimensionality."""
 
     def __init__(
         self,
@@ -40,7 +40,7 @@ def set_sub_network(self, sub_network_embedding_dim: int):
         self.sub_network_embedding_dim = sub_network_embedding_dim
 
     def reset_super_network(self):
-        """Reset the embedding dimensionality of the current sub-network to the original value."""
+        """Reset the embedding dimensionality of the current sub-network to the super-network dimensionality"""
         self.sub_network_embedding_dim = self.embedding_dim
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
diff --git a/whittle/modules/layernorm.py b/whittle/modules/layernorm.py
index 0c5c1766..bcc833fe 100644
--- a/whittle/modules/layernorm.py
+++ b/whittle/modules/layernorm.py
@@ -5,7 +5,7 @@
 
 
 class LayerNorm(torch.nn.LayerNorm):
-    """An extension of PyTorch's `torch.nn.LayerNorm` with support of sub-network dimensionality."""
+    """An extension of PyTorch's `torch.nn.LayerNorm` with support to sub-sample weights corresponding to the sub-network dimensionality."""
 
     def __init__(self, in_features: int, eps: float = 1e-5):
         super().__init__(in_features, eps)
@@ -19,7 +19,7 @@ def set_sub_network(self, sub_network_in_features: int):
         self.sub_network_in_features = sub_network_in_features
 
     def reset_super_network(self):
-        """Reset the input dimensionality of the current sub-network to the original value."""
+        """Reset the input dimensionality of the current sub-network to the super-network dimensionality."""
         self.sub_network_in_features = self.in_features
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
diff --git a/whittle/modules/linear.py b/whittle/modules/linear.py
index e36f5594..08db0bdd 100644
--- a/whittle/modules/linear.py
+++ b/whittle/modules/linear.py
@@ -6,7 +6,7 @@
 
 
 class Linear(nn.Linear):
-    """An extension of PyTorch's `torch.nn.Linear` with support of sub-network dimensionality."""
+    """An extension of PyTorch's `torch.nn.Linear` with support to sub-sample weights corresponding to the sub-network dimensionality"""
 
     def __init__(
         self,
@@ -31,7 +31,7 @@ def set_sub_network(
         self.sub_network_out_features = sub_network_out_features
 
     def reset_super_network(self):
-        """Reset the linear transformation dimensions of the current sub-network to the original value."""
+        """Reset the linear transformation dimensions of the current sub-network to the super-network dimensionality."""
         self.sub_network_in_features = self.in_features
         self.sub_network_out_features = self.out_features
 
diff --git a/whittle/sampling/random_sampler.py b/whittle/sampling/random_sampler.py
index 1fd051cc..cce8b840 100644
--- a/whittle/sampling/random_sampler.py
+++ b/whittle/sampling/random_sampler.py
@@ -9,10 +9,10 @@
 
 class RandomSampler:
     """
-    RandomSampler samples configurations from a given configuration space using a random state.
+    RandomSampler samples configurations from a given search space using a random state.
 
     Args:
-        config_space: The configuration space from which to sample.
+        config_space: The search space from which to sample.
         seed: Seed for the random number generator. Defaults to None.
     """
 
@@ -22,7 +22,7 @@ def __init__(self, config_space: dict, seed: int | None = None):
 
     def sample(self) -> dict[str, Any]:
         """
-        Gets the smallest sub-network configuration from the configuration space.
+        Gets the smallest sub-network configuration from the search space.
 
         Returns:
             dict: The smallest sub-network configuration.
@@ -35,7 +35,7 @@ def sample(self) -> dict[str, Any]:
 
     def get_smallest_sub_network(self) -> dict[str, Any]:
         """
-        Gets the smallest sub-network configuration from the configuration space.
+        Gets the smallest sub-network configuration from the search space.
 
         Returns:
             dict: The smallest sub-network configuration.
@@ -57,7 +57,7 @@ def get_smallest_sub_network(self) -> dict[str, Any]:
 
     def get_largest_sub_network(self) -> dict[str, Any]:
         """
-        gets the largest sub-network configuration from the configuration space.
+        gets the largest sub-network configuration from the search space.
 
         Returns:
             dict: The largest sub-network configuration.

From 1754bdddbda546490cb2ac0f2a8aaac6352e05ec Mon Sep 17 00:00:00 2001
From: mohsenal-zeqri <mmalzzaqri15@eng.just.edu.jo>
Date: Mon, 18 Nov 2024 15:46:22 +0100
Subject: [PATCH 03/11] fix: modified linear.py docstring

---
 whittle/modules/linear.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whittle/modules/linear.py b/whittle/modules/linear.py
index 08db0bdd..873ccbb4 100644
--- a/whittle/modules/linear.py
+++ b/whittle/modules/linear.py
@@ -6,7 +6,7 @@
 
 
 class Linear(nn.Linear):
-    """An extension of PyTorch's `torch.nn.Linear` with support to sub-sample weights corresponding to the sub-network dimensionality"""
+    """An extension of PyTorch's torch.nn.Linear with flexible input and output dimensionality corresponding to sub-network"""
 
     def __init__(
         self,

From a26bda032d790b8fc625c57b06f7da29b5c38639 Mon Sep 17 00:00:00 2001
From: zeqri <137073825+zeqri@users.noreply.github.com>
Date: Tue, 19 Nov 2024 10:19:13 +0100
Subject: [PATCH 04/11] Update whittle/models/gpt/blocks/mlp.py

Co-authored-by: Timur M. Carstensen <40788422+timurcarstensen@users.noreply.github.com>
---
 whittle/models/gpt/blocks/mlp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whittle/models/gpt/blocks/mlp.py b/whittle/models/gpt/blocks/mlp.py
index 0618d215..ab4e74c5 100644
--- a/whittle/models/gpt/blocks/mlp.py
+++ b/whittle/models/gpt/blocks/mlp.py
@@ -88,7 +88,7 @@ def reset_super_network(self):
 
 
 class GemmaMLP(LLaMAMLP):
-    """ "Implementation of the forward pass of LLaMAMLP network."""
+    """Implementation of the forward pass of LLaMAMLP network."""
 
     def __init__(self, config: Config) -> None:
         super().__init__(config)

From 927e5c77b546cd7bd4ede8deee53105d155d12d7 Mon Sep 17 00:00:00 2001
From: zeqri <137073825+zeqri@users.noreply.github.com>
Date: Tue, 19 Nov 2024 10:19:25 +0100
Subject: [PATCH 05/11] Update whittle/models/gpt/extract.py

Co-authored-by: Timur M. Carstensen <40788422+timurcarstensen@users.noreply.github.com>
---
 whittle/models/gpt/extract.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whittle/models/gpt/extract.py b/whittle/models/gpt/extract.py
index b99405f8..401484b8 100644
--- a/whittle/models/gpt/extract.py
+++ b/whittle/models/gpt/extract.py
@@ -24,7 +24,7 @@ def extract_sub_network(model: GPT, sub_network_config: Config) -> GPT:
                                      and number of layers.
 
     Returns:
-        GPT: A new sub-network model instance, initialized with parameters extracted from the original model.
+        A new sub-network model instance, initialized with parameters extracted from the original model.
     """
 
     sub_network = GPT(sub_network_config)

From 60938ac6334bfa661a73d7167aaca0d761392f06 Mon Sep 17 00:00:00 2001
From: zeqri <137073825+zeqri@users.noreply.github.com>
Date: Tue, 19 Nov 2024 10:20:05 +0100
Subject: [PATCH 06/11] Update whittle/sampling/random_sampler.py

Co-authored-by: Timur M. Carstensen <40788422+timurcarstensen@users.noreply.github.com>
---
 whittle/sampling/random_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whittle/sampling/random_sampler.py b/whittle/sampling/random_sampler.py
index cce8b840..5267c6ba 100644
--- a/whittle/sampling/random_sampler.py
+++ b/whittle/sampling/random_sampler.py
@@ -38,7 +38,7 @@ def get_smallest_sub_network(self) -> dict[str, Any]:
         Gets the smallest sub-network configuration from the search space.
 
         Returns:
-            dict: The smallest sub-network configuration.
+            The smallest sub-network configuration.
         """
         config = {}
         for k, v in self.config_space.items():

From 430f74f6a101d89df1686d7ea72c4da5d2e1a5f7 Mon Sep 17 00:00:00 2001
From: zeqri <137073825+zeqri@users.noreply.github.com>
Date: Tue, 19 Nov 2024 10:20:36 +0100
Subject: [PATCH 07/11] Update whittle/sampling/random_sampler.py

Co-authored-by: Timur M. Carstensen <40788422+timurcarstensen@users.noreply.github.com>
---
 whittle/sampling/random_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whittle/sampling/random_sampler.py b/whittle/sampling/random_sampler.py
index 5267c6ba..7f510bb0 100644
--- a/whittle/sampling/random_sampler.py
+++ b/whittle/sampling/random_sampler.py
@@ -60,7 +60,7 @@ def get_largest_sub_network(self) -> dict[str, Any]:
         gets the largest sub-network configuration from the search space.
 
         Returns:
-            dict: The largest sub-network configuration.
+            The largest sub-network configuration.
         """
 
         config = {}

From a532da3cb4361acd634acd9014cac8fd6755a132 Mon Sep 17 00:00:00 2001
From: zeqri <137073825+zeqri@users.noreply.github.com>
Date: Tue, 19 Nov 2024 10:21:02 +0100
Subject: [PATCH 08/11] Update whittle/sampling/random_sampler.py

Co-authored-by: Timur M. Carstensen <40788422+timurcarstensen@users.noreply.github.com>
---
 whittle/sampling/random_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whittle/sampling/random_sampler.py b/whittle/sampling/random_sampler.py
index 7f510bb0..39021278 100644
--- a/whittle/sampling/random_sampler.py
+++ b/whittle/sampling/random_sampler.py
@@ -25,7 +25,7 @@ def sample(self) -> dict[str, Any]:
         Gets the smallest sub-network configuration from the search space.
 
         Returns:
-            dict: The smallest sub-network configuration.
+            A randomly sampled sub-network configuration.
         """
         config = {}
         for hp_name, hparam in self.config_space.items():

From 6b4f28f126db621225998087225872f355fccaf6 Mon Sep 17 00:00:00 2001
From: zeqri <137073825+zeqri@users.noreply.github.com>
Date: Tue, 19 Nov 2024 10:21:40 +0100
Subject: [PATCH 09/11] Update whittle/search/search.py

Co-authored-by: Timur M. Carstensen <40788422+timurcarstensen@users.noreply.github.com>
---
 whittle/search/search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whittle/search/search.py b/whittle/search/search.py
index 1d8df89c..c646c84c 100644
--- a/whittle/search/search.py
+++ b/whittle/search/search.py
@@ -34,7 +34,7 @@ def multi_objective_search(
             Defaults to None.
 
     Returns:
-        dict: The results of the search, including Pareto-optimal solutions.
+        The results of the search, including Pareto-optimal solutions.
 
     """
     metrics = ["objective_1", "objective_2"]

From d5c65415d1007944d72274f308732d44cc907988 Mon Sep 17 00:00:00 2001
From: zeqri <137073825+zeqri@users.noreply.github.com>
Date: Tue, 19 Nov 2024 10:21:53 +0100
Subject: [PATCH 10/11] Update whittle/search/search.py

Co-authored-by: Timur M. Carstensen <40788422+timurcarstensen@users.noreply.github.com>
---
 whittle/search/search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whittle/search/search.py b/whittle/search/search.py
index c646c84c..552b50ff 100644
--- a/whittle/search/search.py
+++ b/whittle/search/search.py
@@ -22,7 +22,7 @@ def multi_objective_search(
     Search for the Pareto-optimal sub-networks using the specified strategy.
 
     Args:
-        objective (callable): The objective function to optimize.
+        objective: The objective function to optimize.
         search_space: The search space for the optimization.
         search_strategy: The search strategy to use.
             Defaults to "random_search".

From c3a8ac6bdc2497a184df72acfb22457997513123 Mon Sep 17 00:00:00 2001
From: mohsenal-zeqri <mmalzzaqri15@eng.just.edu.jo>
Date: Thu, 28 Nov 2024 16:15:44 +0100
Subject: [PATCH 11/11] enhance: minor changes to docstrings

---
 .../models/gpt/blocks/causal_self_attention.py | 12 ++++++++++--
 whittle/models/gpt/blocks/mlp.py               | 18 +++++++++++++++---
 whittle/models/gpt/extract.py                  |  3 +--
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/whittle/models/gpt/blocks/causal_self_attention.py b/whittle/models/gpt/blocks/causal_self_attention.py
index 272201eb..c0f277c2 100644
--- a/whittle/models/gpt/blocks/causal_self_attention.py
+++ b/whittle/models/gpt/blocks/causal_self_attention.py
@@ -50,7 +50,15 @@ def set_sub_network(
         sub_network_query_groups: int,
         sub_network_head_size: int,
     ):
-        """Sets the CausalSelfAttention block to the specified sub-network dimensionality."""
+        """
+        Sets the CausalSelfAttention block to the specified sub-network dimensionality.
+
+        Args:
+            sub_network_n_embd: Embedding dimension of the sub-network.
+            sub_network_n_head: Number of attention heads in the sub-network.
+            sub_network_query_groups: Number of query groups for grouped-query attention (GQA).
+            sub_network_head_size: Size of each attention head in the sub-network.
+        """
         self.sub_network_n_embd = sub_network_n_embd
         self.sub_network_n_head = sub_network_n_head
         self.sub_network_query_groups = sub_network_query_groups
@@ -76,7 +84,7 @@ def set_sub_network(
             self.sub_attention_scaler = self.config.attention_scores_scalar
 
     def reset_super_network(self):
-        """Resets the dimensionality of the current to the super-network dimensionality."""
+        """Resets the dimensionality of the current sub-network to the super-network dimensionality."""
         self.sub_network_n_embd = self.config.n_embd
         self.sub_network_n_head = self.config.n_head
         self.sub_network_head_size = self.config.head_size
diff --git a/whittle/models/gpt/blocks/mlp.py b/whittle/models/gpt/blocks/mlp.py
index ab4e74c5..215eb62f 100644
--- a/whittle/models/gpt/blocks/mlp.py
+++ b/whittle/models/gpt/blocks/mlp.py
@@ -26,7 +26,13 @@ def __init__(self, config: Config) -> None:
     def set_sub_network(
         self, sub_network_n_embd: int, sub_network_intermediate_size: int
     ):
-        """Set the input dimensionality of the current sub-network."""
+        """
+        Sets the dimensionality of the current sub-network MLP layers.
+
+        Args:
+            sub_network_n_embd: Input and output embedding dimension of the sub-network.
+            sub_network_intermediate_size: Hidden layer dimension of the sub-network MLP.
+        """
         self.sub_network_n_embd = sub_network_n_embd
         self.sub_network_intermediate_size = sub_network_intermediate_size
 
@@ -38,7 +44,7 @@ def set_sub_network(
         )
 
     def reset_super_network(self):
-        """Reset the input dimensionality of the current sub-network to the super-network dimensionality."""
+        """Resets the MLP dimensions to the original super-network dimensionality."""
         self.sub_network_n_embd = self.in_features
         self.sub_network_intermediate_size = self.intermediate_size
 
@@ -63,7 +69,13 @@ def __init__(self, config: Config) -> None:
     def set_sub_network(
         self, sub_network_n_embd: int, sub_network_intermediate_size: int
     ):
-        """Set the input dimensionality of the current sub-network."""
+        """
+        Sets the dimensionality of the current sub-network MLP layers.
+
+        Args:
+            sub_network_n_embd: Input and output embedding dimension of the sub-network.
+            sub_network_intermediate_size: Hidden layer dimension of the sub-network MLP.
+        """
         self.sub_network_n_embd = sub_network_n_embd
         self.sub_network_intermediate_size = sub_network_intermediate_size
 
diff --git a/whittle/models/gpt/extract.py b/whittle/models/gpt/extract.py
index 401484b8..ec5f14a6 100644
--- a/whittle/models/gpt/extract.py
+++ b/whittle/models/gpt/extract.py
@@ -20,8 +20,7 @@ def extract_sub_network(model: GPT, sub_network_config: Config) -> GPT:
     Args:
         model: The original, full GPT model from which the sub-network is extracted.
         sub_network_config: Configuration object for the sub-network, containing the necessary
-                                     architecture specifications such as embedding size, number of heads,
-                                     and number of layers.
+            architecture specifications such as embedding size, number of heads, and number of layers.
 
     Returns:
         A new sub-network model instance, initialized with parameters extracted from the original model.