From b9029c5c8a5162ce4ee5db59d3247c075c383f97 Mon Sep 17 00:00:00 2001 From: mohsenal-zeqri Date: Sat, 16 Nov 2024 15:31:25 +0100 Subject: [PATCH 01/11] fix: addition and standardization of docstrings --- CONTRIBUTING.md | 14 +++++++++++ whittle/models/gpt/extract.py | 18 +++++++++++++- whittle/models/gpt/utils.py | 3 ++- whittle/modules/embedding.py | 4 ++++ whittle/modules/layernorm.py | 4 ++++ whittle/modules/linear.py | 5 +++- whittle/sampling/random_sampler.py | 35 +++++++++++++++++++++++++--- whittle/search/ask_tell_scheduler.py | 15 ++++++++---- whittle/search/local_search.py | 28 +++++++++++----------- whittle/search/multi_objective.py | 11 ++++++--- whittle/search/search.py | 30 +++++++++++++++--------- 11 files changed, 128 insertions(+), 39 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e9afa97a..8e55017c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,6 +21,20 @@ pre-commit install pytest ``` +## Docstring Writing Guidelines + +When adding or updating functions or classes, please ensure that each has a docstring that follows this format: + +- **Summary**: A brief description of what the function or class does. +- **args**: List each argument with its name, and a short description of its purpose. +- **return**: Describe the return value, including what it represents. +**Note**: After adding or updating the docstring, ensure that the code passes the following command with **no warnings**: + +```bash +mkdocs build --clean --strict +``` + + ## Conventional commits and Commitizen We use [commitizen](https://commitizen-tools.github.io/commitizen/) to manage commits. 
diff --git a/whittle/models/gpt/extract.py b/whittle/models/gpt/extract.py index 04d37407..b99405f8 100644 --- a/whittle/models/gpt/extract.py +++ b/whittle/models/gpt/extract.py @@ -1,5 +1,6 @@ from __future__ import annotations + import torch.nn as nn from collections import OrderedDict @@ -8,9 +9,24 @@ from whittle.models.gpt.blocks.mlp import GptNeoxMLP, LLaMAMLP from whittle.modules.layernorm import LayerNorm from whittle.modules.rmsnorm import RMSNorm +from litgpt import Config + + +def extract_sub_network(model: GPT, sub_network_config: Config) -> GPT: + """ + Extracts a sub-network from a given model based on the specified sub-network configuration. + Copies relevant layers, weights, and configurations from the full model into a sub-network model. + + Args: + model: The original, full GPT model from which the sub-network is extracted. + sub_network_config: Configuration object for the sub-network, containing the necessary + architecture specifications such as embedding size, number of heads, + and number of layers. + Returns: + GPT: A new sub-network model instance, initialized with parameters extracted from the original model. + """ -def extract_sub_network(model, sub_network_config): sub_network = GPT(sub_network_config) state_dict = extract_linear(model.lm_head) diff --git a/whittle/models/gpt/utils.py b/whittle/models/gpt/utils.py index ef3ddde2..2cf2ea86 100644 --- a/whittle/models/gpt/utils.py +++ b/whittle/models/gpt/utils.py @@ -324,7 +324,8 @@ def map_old_state_dict_weights(state_dict: dict, mapping: Mapping, prefix: str) def get_default_supported_precision(training: bool) -> str: - """Return default precision that is supported by the hardware: either `bf16` or `16`. + """ + Return default precision that is supported by the hardware: either `bf16` or `16`. 
Args: training: `-mixed` or `-true` version of the precision to use diff --git a/whittle/modules/embedding.py b/whittle/modules/embedding.py index 94045336..abe5ab45 100644 --- a/whittle/modules/embedding.py +++ b/whittle/modules/embedding.py @@ -6,6 +6,8 @@ class Embedding(torch.nn.Embedding): + """An extension of PyTorch's `torch.nn.Embedding` with support of sub-network dimensionality.""" + def __init__( self, num_embeddings: int, @@ -34,9 +36,11 @@ def __init__( self.sub_network_embedding_dim: int | None = embedding_dim def set_sub_network(self, sub_network_embedding_dim: int): + """Set the embedding dimensionality of the current sub-network.""" self.sub_network_embedding_dim = sub_network_embedding_dim def reset_super_network(self): + """Reset the embedding dimensionality of the current sub-network to the original value.""" self.sub_network_embedding_dim = self.embedding_dim def forward(self, x: torch.Tensor) -> torch.Tensor: diff --git a/whittle/modules/layernorm.py b/whittle/modules/layernorm.py index cc1bc7d3..0c5c1766 100644 --- a/whittle/modules/layernorm.py +++ b/whittle/modules/layernorm.py @@ -5,6 +5,8 @@ class LayerNorm(torch.nn.LayerNorm): + """An extension of PyTorch's `torch.nn.LayerNorm` with support of sub-network dimensionality.""" + def __init__(self, in_features: int, eps: float = 1e-5): super().__init__(in_features, eps) self.in_features = in_features @@ -13,9 +15,11 @@ def __init__(self, in_features: int, eps: float = 1e-5): self.sub_network_in_features = self.in_features def set_sub_network(self, sub_network_in_features: int): + """Set the input dimensionality of the current sub-network.""" self.sub_network_in_features = sub_network_in_features def reset_super_network(self): + """Reset the input dimensionality of the current sub-network to the original value.""" self.sub_network_in_features = self.in_features def forward(self, x: torch.Tensor) -> torch.Tensor: diff --git a/whittle/modules/linear.py b/whittle/modules/linear.py index 
5bbb5c74..e36f5594 100644 --- a/whittle/modules/linear.py +++ b/whittle/modules/linear.py @@ -6,6 +6,8 @@ class Linear(nn.Linear): + """An extension of PyTorch's `torch.nn.Linear` with support of sub-network dimensionality.""" + def __init__( self, in_features: int, @@ -14,7 +16,6 @@ def __init__( device=None, dtype=None, ): - """ """ super().__init__(in_features, out_features, bias, device, dtype) # Set the current sub-network dimensions equal to super-network @@ -25,10 +26,12 @@ def __init__( def set_sub_network( self, sub_network_in_features: int, sub_network_out_features: int ): + """Set the linear transformation dimensions of the current sub-network.""" self.sub_network_in_features = sub_network_in_features self.sub_network_out_features = sub_network_out_features def reset_super_network(self): + """Reset the linear transformation dimensions of the current sub-network to the original value.""" self.sub_network_in_features = self.in_features self.sub_network_out_features = self.out_features diff --git a/whittle/sampling/random_sampler.py b/whittle/sampling/random_sampler.py index c569493e..1fd051cc 100644 --- a/whittle/sampling/random_sampler.py +++ b/whittle/sampling/random_sampler.py @@ -1,23 +1,45 @@ from __future__ import annotations import warnings +from typing import Any + import numpy as np from syne_tune.config_space import Categorical, Domain class RandomSampler: + """ + RandomSampler samples configurations from a given configuration space using a random state. + + Args: + config_space: The configuration space from which to sample. + seed: Seed for the random number generator. Defaults to None. + """ + def __init__(self, config_space: dict, seed: int | None = None): self.config_space = config_space self.rng = np.random.RandomState(seed) - def sample(self): + def sample(self) -> dict[str, Any]: + """ + Gets the smallest sub-network configuration from the configuration space. + + Returns: + dict: The smallest sub-network configuration. 
+ """ config = {} for hp_name, hparam in self.config_space.items(): if isinstance(hparam, Domain): config[hp_name] = hparam.sample(random_state=self.rng) return config - def get_smallest_sub_network(self): + def get_smallest_sub_network(self) -> dict[str, Any]: + """ + Gets the smallest sub-network configuration from the configuration space. + + Returns: + dict: The smallest sub-network configuration. + """ config = {} for k, v in self.config_space.items(): if isinstance(v, Domain): @@ -33,7 +55,14 @@ def get_smallest_sub_network(self): config[k] = v.lower return config - def get_largest_sub_network(self): + def get_largest_sub_network(self) -> dict[str, Any]: + """ + gets the largest sub-network configuration from the configuration space. + + Returns: + dict: The largest sub-network configuration. + """ + config = {} for k, v in self.config_space.items(): if isinstance(v, Domain): diff --git a/whittle/search/ask_tell_scheduler.py b/whittle/search/ask_tell_scheduler.py index 64777b70..848dc8b5 100644 --- a/whittle/search/ask_tell_scheduler.py +++ b/whittle/search/ask_tell_scheduler.py @@ -19,7 +19,9 @@ def __init__(self, base_scheduler: TrialScheduler): def ask(self) -> Trial: """ Ask the scheduler for new trial to run - :return: Trial to run + + Returns: + Trial to run """ trial_suggestion = self.bscheduler.suggest(self.trial_counter) trial = Trial( @@ -32,10 +34,12 @@ def ask(self) -> Trial: def tell(self, trial: Trial, experiment_result: dict[str, float]): """ - Feed experiment results back to the Scheduler + Feed experiment results back to the Scheduler. + + Args: + trial: Trial that was run. + experiment_result: {metric: value} dictionary with experiment results. 
- :param trial: Trial that was run - :param experiment_result: {metric: value} dictionary with experiment results """ trial_result = trial.add_results( metrics=experiment_result, @@ -47,7 +51,8 @@ def tell(self, trial: Trial, experiment_result: dict[str, float]): def best_trial(self, metris: str) -> TrialResult: """ - Return the best trial according to the provided metric + Returns: + the best trial according to the provided metric. """ if self.bscheduler.mode == "max": sign = 1.0 diff --git a/whittle/search/local_search.py b/whittle/search/local_search.py index a3e479f5..a82d5acc 100644 --- a/whittle/search/local_search.py +++ b/whittle/search/local_search.py @@ -27,21 +27,21 @@ class PopulationElement: class LS(FIFOScheduler): """ - See :class:`~syne_tune.optimizer.schedulers.searchers.RandomSearcher` - for ``kwargs["search_options"]`` parameters. - - :param config_space: Configuration space for evaluation function - :param metric: Name of metric to optimize - :param population_size: See - :class:`~syne_tune.optimizer.schedulers.searchers.RegularizedEvolution`. - Defaults to 100 - :param sample_size: See - :class:`~syne_tune.optimizer.schedulers.searchers.RegularizedEvolution`. - Defaults to 10 - :param random_seed: Random seed, optional - :param kwargs: Additional arguments to - :class:`~syne_tune.optimizer.schedulers.FIFOScheduler` + See :class:`~syne_tune.optimizer.schedulers.searchers.RandomSearcher` for ``kwargs["search_options"]`` parameters. + + Args: + config_space: Configuration space for evaluation function + metric: Name of metric to optimize + population_size: See + :class:`~syne_tune.optimizer.schedulers.searchers.RegularizedEvolution`. + Defaults to 100 + sample_size: See + :class:`~syne_tune.optimizer.schedulers.searchers.RegularizedEvolution`. + Defaults to 10 + random_seed: Random seed, optional.
+ kwargs: Additional arguments to + :class:`~syne_tune.optimizer.schedulers.FIFOScheduler` """ def __init__( diff --git a/whittle/search/multi_objective.py b/whittle/search/multi_objective.py index bae442a3..a5eac5e7 100644 --- a/whittle/search/multi_objective.py +++ b/whittle/search/multi_objective.py @@ -5,10 +5,15 @@ def get_pareto_optimal(costs: np.ndarray) -> npt.NDArray[np.bool_]: - """Find the pareto-optimal point. + """ + Find the pareto-optimal point. + + Args: + costs: (n_points, m_cost_values) array + + Returns: + (n_points, 1) indicator if point is on pareto front or not. - :param costs: (n_points, m_cost_values) array - :return: (n_points, 1) indicator if point is on pareto front or not. """ assert isinstance(costs, np.ndarray) assert costs.ndim == 2 diff --git a/whittle/search/search.py b/whittle/search/search.py index e9fe095d..1d8df89c 100644 --- a/whittle/search/search.py +++ b/whittle/search/search.py @@ -1,6 +1,7 @@ from __future__ import annotations import time +from typing import Any import numpy as np @@ -16,19 +17,26 @@ def multi_objective_search( num_samples: int = 100, objective_kwargs: dict | None = None, seed: int | None = None, -): - """ - Search for the Pareto optimal sub-networks. - - :param objective: the objective function to optimize. - :param search_space: the search space. - :param search_strategy: the search strategy. - :param objective_kwargs: the keyword arguments for the objective function. - :param num_samples: the number of samples to take. - :param seed: the random seed. - :return: the results of the search. +) -> dict[str, Any]: """ + Search for the Pareto-optimal sub-networks using the specified strategy. + + Args: + objective (callable): The objective function to optimize. + search_space: The search space for the optimization. + search_strategy: The search strategy to use. + Defaults to "random_search". + num_samples: The number of samples to evaluate. + Defaults to 100. 
+ objective_kwargs: Keyword arguments for the objective function. + Defaults to None. + seed: The random seed for reproducibility. + Defaults to None. + Returns: + dict: The results of the search, including Pareto-optimal solutions. + + """ metrics = ["objective_1", "objective_2"] if seed is None: seed = np.random.randint(0, 1000000) From ff4144e755c18e386b3b19874a14eb192f6108c2 Mon Sep 17 00:00:00 2001 From: mohsenal-zeqri Date: Sun, 17 Nov 2024 17:36:08 +0100 Subject: [PATCH 02/11] enhance: modified and added more docstrings --- whittle/models/gpt/blocks/causal_self_attention.py | 4 ++++ whittle/models/gpt/blocks/mlp.py | 10 ++++++++++ whittle/modules/embedding.py | 4 ++-- whittle/modules/layernorm.py | 4 ++-- whittle/modules/linear.py | 4 ++-- whittle/sampling/random_sampler.py | 10 +++++----- 6 files changed, 25 insertions(+), 11 deletions(-) diff --git a/whittle/models/gpt/blocks/causal_self_attention.py b/whittle/models/gpt/blocks/causal_self_attention.py index 07420ec8..272201eb 100644 --- a/whittle/models/gpt/blocks/causal_self_attention.py +++ b/whittle/models/gpt/blocks/causal_self_attention.py @@ -11,6 +11,8 @@ class CausalSelfAttention(nn.Module): + """Extension of litgpt's `litgpt.model.CausalSelfAttention` with support to adapt to sub-network dimensionality.""" + def __init__(self, config: Config, block_idx: int) -> None: super().__init__() shape = (config.n_head + 2 * config.n_query_groups) * config.head_size @@ -48,6 +50,7 @@ def set_sub_network( sub_network_query_groups: int, sub_network_head_size: int, ): + """Sets the CausalSelfAttention block to the specified sub-network dimensionality.""" self.sub_network_n_embd = sub_network_n_embd self.sub_network_n_head = sub_network_n_head self.sub_network_query_groups = sub_network_query_groups @@ -73,6 +76,7 @@ def set_sub_network( self.sub_attention_scaler = self.config.attention_scores_scalar def reset_super_network(self): + """Resets the dimensionality of the current to the super-network 
dimensionality.""" self.sub_network_n_embd = self.config.n_embd self.sub_network_n_head = self.config.n_head self.sub_network_head_size = self.config.head_size diff --git a/whittle/models/gpt/blocks/mlp.py b/whittle/models/gpt/blocks/mlp.py index c87b9744..0618d215 100644 --- a/whittle/models/gpt/blocks/mlp.py +++ b/whittle/models/gpt/blocks/mlp.py @@ -9,6 +9,8 @@ class GptNeoxMLP(litgpt.model.GptNeoxMLP): + """An extension of litgpt's `litgpt.model.GptNeoxMLP` with support to adapt to sub-network dimensionality.""" + def __init__(self, config: Config) -> None: super().__init__(config) self.fc = Linear(config.n_embd, config.intermediate_size, bias=config.bias) @@ -24,6 +26,7 @@ def __init__(self, config: Config) -> None: def set_sub_network( self, sub_network_n_embd: int, sub_network_intermediate_size: int ): + """Set the input dimensionality of the current sub-network.""" self.sub_network_n_embd = sub_network_n_embd self.sub_network_intermediate_size = sub_network_intermediate_size @@ -35,6 +38,7 @@ def set_sub_network( ) def reset_super_network(self): + """Reset the input dimensionality of the current sub-network to the super-network dimensionality.""" self.sub_network_n_embd = self.in_features self.sub_network_intermediate_size = self.intermediate_size @@ -43,6 +47,8 @@ def reset_super_network(self): class LLaMAMLP(litgpt.model.LLaMAMLP): + """An extension of litgpt's `litgpt.model.LLaMAMLP` with support to adapt to sub-network dimensionality.""" + def __init__(self, config: Config) -> None: super().__init__(config) self.fc_1 = Linear(config.n_embd, config.intermediate_size, bias=config.bias) @@ -57,6 +63,7 @@ def __init__(self, config: Config) -> None: def set_sub_network( self, sub_network_n_embd: int, sub_network_intermediate_size: int ): + """Set the input dimensionality of the current sub-network.""" self.sub_network_n_embd = sub_network_n_embd self.sub_network_intermediate_size = sub_network_intermediate_size @@ -71,6 +78,7 @@ def set_sub_network( ) def
reset_super_network(self): + """Reset the input dimensionality of the current sub-network to the super-network dimensionality.""" self.sub_network_n_embd = self.in_features self.sub_network_intermediate_size = self.intermediate_size @@ -80,6 +88,8 @@ def reset_super_network(self): class GemmaMLP(LLaMAMLP): + """ "Implementation of the forward pass of LLaMAMLP network.""" + def __init__(self, config: Config) -> None: super().__init__(config) diff --git a/whittle/modules/embedding.py b/whittle/modules/embedding.py index abe5ab45..771ecbee 100644 --- a/whittle/modules/embedding.py +++ b/whittle/modules/embedding.py @@ -6,7 +6,7 @@ class Embedding(torch.nn.Embedding): - """An extension of PyTorch's `torch.nn.Embedding` with support of sub-network dimensionality.""" + """An extension of PyTorch's `torch.nn.Embedding` with support to sub-sample weights corresponding to the sub-network dimensionality.""" def __init__( self, @@ -40,7 +40,7 @@ def set_sub_network(self, sub_network_embedding_dim: int): self.sub_network_embedding_dim = sub_network_embedding_dim def reset_super_network(self): - """Reset the embedding dimensionality of the current sub-network to the original value.""" + """Reset the embedding dimensionality of the current sub-network to the super-network dimensionality.""" self.sub_network_embedding_dim = self.embedding_dim def forward(self, x: torch.Tensor) -> torch.Tensor: diff --git a/whittle/modules/layernorm.py b/whittle/modules/layernorm.py index 0c5c1766..bcc833fe 100644 --- a/whittle/modules/layernorm.py +++ b/whittle/modules/layernorm.py @@ -5,7 +5,7 @@ class LayerNorm(torch.nn.LayerNorm): - """An extension of PyTorch's `torch.nn.LayerNorm` with support of sub-network dimensionality.""" + """An extension of PyTorch's `torch.nn.LayerNorm` with support to sub-sample weights corresponding to the sub-network dimensionality.""" def __init__(self, in_features: int, eps: float = 1e-5): super().__init__(in_features, eps) @@ -19,7 +19,7 @@ def
set_sub_network(self, sub_network_in_features: int): self.sub_network_in_features = sub_network_in_features def reset_super_network(self): - """Reset the input dimensionality of the current sub-network to the original value.""" + """Reset the input dimensionality of the current sub-network to the super-network dimensionality.""" self.sub_network_in_features = self.in_features def forward(self, x: torch.Tensor) -> torch.Tensor: diff --git a/whittle/modules/linear.py b/whittle/modules/linear.py index e36f5594..08db0bdd 100644 --- a/whittle/modules/linear.py +++ b/whittle/modules/linear.py @@ -6,7 +6,7 @@ class Linear(nn.Linear): - """An extension of PyTorch's `torch.nn.Linear` with support of sub-network dimensionality.""" + """An extension of PyTorch's `torch.nn.Linear` with support to sub-sample weights corresponding to the sub-network dimensionality""" def __init__( self, @@ -31,7 +31,7 @@ def set_sub_network( self.sub_network_out_features = sub_network_out_features def reset_super_network(self): - """Reset the linear transformation dimensions of the current sub-network to the original value.""" + """Reset the linear transformation dimensions of the current sub-network to the super-network dimensionality.""" self.sub_network_in_features = self.in_features self.sub_network_out_features = self.out_features diff --git a/whittle/sampling/random_sampler.py b/whittle/sampling/random_sampler.py index 1fd051cc..cce8b840 100644 --- a/whittle/sampling/random_sampler.py +++ b/whittle/sampling/random_sampler.py @@ -9,10 +9,10 @@ class RandomSampler: """ - RandomSampler samples configurations from a given configuration space using a random state. + RandomSampler samples configurations from a given search space using a random state. Args: - config_space: The configuration space from which to sample. + config_space: The search space from which to sample. seed: Seed for the random number generator. Defaults to None. 
""" @@ -22,7 +22,7 @@ def __init__(self, config_space: dict, seed: int | None = None): def sample(self) -> dict[str, Any]: """ - Gets the smallest sub-network configuration from the configuration space. + Gets the smallest sub-network configuration from the search space. Returns: dict: The smallest sub-network configuration. @@ -35,7 +35,7 @@ def sample(self) -> dict[str, Any]: def get_smallest_sub_network(self) -> dict[str, Any]: """ - Gets the smallest sub-network configuration from the configuration space. + Gets the smallest sub-network configuration from the search space. Returns: dict: The smallest sub-network configuration. @@ -57,7 +57,7 @@ def get_smallest_sub_network(self) -> dict[str, Any]: def get_largest_sub_network(self) -> dict[str, Any]: """ - gets the largest sub-network configuration from the configuration space. + gets the largest sub-network configuration from the search space. Returns: dict: The largest sub-network configuration. From 1754bdddbda546490cb2ac0f2a8aaac6352e05ec Mon Sep 17 00:00:00 2001 From: mohsenal-zeqri Date: Mon, 18 Nov 2024 15:46:22 +0100 Subject: [PATCH 03/11] fix: modified linear.py docstring --- whittle/modules/linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whittle/modules/linear.py b/whittle/modules/linear.py index 08db0bdd..873ccbb4 100644 --- a/whittle/modules/linear.py +++ b/whittle/modules/linear.py @@ -6,7 +6,7 @@ class Linear(nn.Linear): - """An extension of PyTorch's `torch.nn.Linear` with support to sub-sample weights corresponding to the sub-network dimensionality""" + """An extension of PyTorch's torch.nn.Linear with flexible input and output dimensionality corresponding to sub-network""" def __init__( self, From a26bda032d790b8fc625c57b06f7da29b5c38639 Mon Sep 17 00:00:00 2001 From: zeqri <137073825+zeqri@users.noreply.github.com> Date: Tue, 19 Nov 2024 10:19:13 +0100 Subject: [PATCH 04/11] Update whittle/models/gpt/blocks/mlp.py Co-authored-by: Timur M. 
Carstensen <40788422+timurcarstensen@users.noreply.github.com> --- whittle/models/gpt/blocks/mlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whittle/models/gpt/blocks/mlp.py b/whittle/models/gpt/blocks/mlp.py index 0618d215..ab4e74c5 100644 --- a/whittle/models/gpt/blocks/mlp.py +++ b/whittle/models/gpt/blocks/mlp.py @@ -88,7 +88,7 @@ def reset_super_network(self): class GemmaMLP(LLaMAMLP): - """ "Implementation of the forward pass of LLaMAMLP network.""" + """Implementation of the forward pass of LLaMAMLP network.""" def __init__(self, config: Config) -> None: super().__init__(config) From 927e5c77b546cd7bd4ede8deee53105d155d12d7 Mon Sep 17 00:00:00 2001 From: zeqri <137073825+zeqri@users.noreply.github.com> Date: Tue, 19 Nov 2024 10:19:25 +0100 Subject: [PATCH 05/11] Update whittle/models/gpt/extract.py Co-authored-by: Timur M. Carstensen <40788422+timurcarstensen@users.noreply.github.com> --- whittle/models/gpt/extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whittle/models/gpt/extract.py b/whittle/models/gpt/extract.py index b99405f8..401484b8 100644 --- a/whittle/models/gpt/extract.py +++ b/whittle/models/gpt/extract.py @@ -24,7 +24,7 @@ def extract_sub_network(model: GPT, sub_network_config: Config) -> GPT: and number of layers. Returns: - GPT: A new sub-network model instance, initialized with parameters extracted from the original model. + A new sub-network model instance, initialized with parameters extracted from the original model. """ sub_network = GPT(sub_network_config) From 60938ac6334bfa661a73d7167aaca0d761392f06 Mon Sep 17 00:00:00 2001 From: zeqri <137073825+zeqri@users.noreply.github.com> Date: Tue, 19 Nov 2024 10:20:05 +0100 Subject: [PATCH 06/11] Update whittle/sampling/random_sampler.py Co-authored-by: Timur M. 
Carstensen <40788422+timurcarstensen@users.noreply.github.com> --- whittle/sampling/random_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whittle/sampling/random_sampler.py b/whittle/sampling/random_sampler.py index cce8b840..5267c6ba 100644 --- a/whittle/sampling/random_sampler.py +++ b/whittle/sampling/random_sampler.py @@ -38,7 +38,7 @@ def get_smallest_sub_network(self) -> dict[str, Any]: Gets the smallest sub-network configuration from the search space. Returns: - dict: The smallest sub-network configuration. + The smallest sub-network configuration. """ config = {} for k, v in self.config_space.items(): From 430f74f6a101d89df1686d7ea72c4da5d2e1a5f7 Mon Sep 17 00:00:00 2001 From: zeqri <137073825+zeqri@users.noreply.github.com> Date: Tue, 19 Nov 2024 10:20:36 +0100 Subject: [PATCH 07/11] Update whittle/sampling/random_sampler.py Co-authored-by: Timur M. Carstensen <40788422+timurcarstensen@users.noreply.github.com> --- whittle/sampling/random_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whittle/sampling/random_sampler.py b/whittle/sampling/random_sampler.py index 5267c6ba..7f510bb0 100644 --- a/whittle/sampling/random_sampler.py +++ b/whittle/sampling/random_sampler.py @@ -60,7 +60,7 @@ def get_largest_sub_network(self) -> dict[str, Any]: gets the largest sub-network configuration from the search space. Returns: - dict: The largest sub-network configuration. + The largest sub-network configuration. """ config = {} From a532da3cb4361acd634acd9014cac8fd6755a132 Mon Sep 17 00:00:00 2001 From: zeqri <137073825+zeqri@users.noreply.github.com> Date: Tue, 19 Nov 2024 10:21:02 +0100 Subject: [PATCH 08/11] Update whittle/sampling/random_sampler.py Co-authored-by: Timur M. 
Carstensen <40788422+timurcarstensen@users.noreply.github.com> --- whittle/sampling/random_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whittle/sampling/random_sampler.py b/whittle/sampling/random_sampler.py index 7f510bb0..39021278 100644 --- a/whittle/sampling/random_sampler.py +++ b/whittle/sampling/random_sampler.py @@ -25,7 +25,7 @@ def sample(self) -> dict[str, Any]: Gets the smallest sub-network configuration from the search space. Returns: - dict: The smallest sub-network configuration. + The smallest sub-network configuration. """ config = {} for hp_name, hparam in self.config_space.items(): From 6b4f28f126db621225998087225872f355fccaf6 Mon Sep 17 00:00:00 2001 From: zeqri <137073825+zeqri@users.noreply.github.com> Date: Tue, 19 Nov 2024 10:21:40 +0100 Subject: [PATCH 09/11] Update whittle/search/search.py Co-authored-by: Timur M. Carstensen <40788422+timurcarstensen@users.noreply.github.com> --- whittle/search/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whittle/search/search.py b/whittle/search/search.py index 1d8df89c..c646c84c 100644 --- a/whittle/search/search.py +++ b/whittle/search/search.py @@ -34,7 +34,7 @@ def multi_objective_search( Defaults to None. Returns: - dict: The results of the search, including Pareto-optimal solutions. + The results of the search, including Pareto-optimal solutions. """ metrics = ["objective_1", "objective_2"] From d5c65415d1007944d72274f308732d44cc907988 Mon Sep 17 00:00:00 2001 From: zeqri <137073825+zeqri@users.noreply.github.com> Date: Tue, 19 Nov 2024 10:21:53 +0100 Subject: [PATCH 10/11] Update whittle/search/search.py Co-authored-by: Timur M. 
Carstensen <40788422+timurcarstensen@users.noreply.github.com> --- whittle/search/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whittle/search/search.py b/whittle/search/search.py index c646c84c..552b50ff 100644 --- a/whittle/search/search.py +++ b/whittle/search/search.py @@ -22,7 +22,7 @@ def multi_objective_search( Search for the Pareto-optimal sub-networks using the specified strategy. Args: - objective (callable): The objective function to optimize. + objective: The objective function to optimize. search_space: The search space for the optimization. search_strategy: The search strategy to use. Defaults to "random_search". From c3a8ac6bdc2497a184df72acfb22457997513123 Mon Sep 17 00:00:00 2001 From: mohsenal-zeqri Date: Thu, 28 Nov 2024 16:15:44 +0100 Subject: [PATCH 11/11] enhance: minor changes to docstrings --- .../models/gpt/blocks/causal_self_attention.py | 12 ++++++++++-- whittle/models/gpt/blocks/mlp.py | 18 +++++++++++++++--- whittle/models/gpt/extract.py | 3 +-- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/whittle/models/gpt/blocks/causal_self_attention.py b/whittle/models/gpt/blocks/causal_self_attention.py index 272201eb..c0f277c2 100644 --- a/whittle/models/gpt/blocks/causal_self_attention.py +++ b/whittle/models/gpt/blocks/causal_self_attention.py @@ -50,7 +50,15 @@ def set_sub_network( sub_network_query_groups: int, sub_network_head_size: int, ): - """Sets the CausalSelfAttention block to the specified sub-network dimensionality.""" + """ + Sets the CausalSelfAttention block to the specified sub-network dimensionality. + + Args: + sub_network_n_embd: Embedding dimension of the sub-network + sub_network_n_head: Number of attention heads in the sub-network + sub_network_query_groups: Number of query groups for grouped-query attention (GQA). + sub_network_head_size: Size of each attention head in the sub-network. 
+ """ self.sub_network_n_embd = sub_network_n_embd self.sub_network_n_head = sub_network_n_head self.sub_network_query_groups = sub_network_query_groups @@ -76,7 +84,7 @@ def set_sub_network( self.sub_attention_scaler = self.config.attention_scores_scalar def reset_super_network(self): - """Resets the dimensionality of the current to the super-network dimensionality.""" + """Resets the dimensionality of the current sub-network to the super-network dimensionality.""" self.sub_network_n_embd = self.config.n_embd self.sub_network_n_head = self.config.n_head self.sub_network_head_size = self.config.head_size diff --git a/whittle/models/gpt/blocks/mlp.py b/whittle/models/gpt/blocks/mlp.py index ab4e74c5..215eb62f 100644 --- a/whittle/models/gpt/blocks/mlp.py +++ b/whittle/models/gpt/blocks/mlp.py @@ -26,7 +26,13 @@ def __init__(self, config: Config) -> None: def set_sub_network( self, sub_network_n_embd: int, sub_network_intermediate_size: int ): - """Set the input dimensionality of the current sub-network.""" + """ + Sets the dimensionality of the current sub-network MLP layers. + + Args: + sub_network_n_embd: Input and output embedding dimension of the sub-network. + sub_network_intermediate_size: Hidden layer dimension of the sub-network MLP. 
+ """ self.sub_network_n_embd = sub_network_n_embd self.sub_network_intermediate_size = sub_network_intermediate_size @@ -38,7 +44,7 @@ def set_sub_network( ) def reset_super_network(self): - """Reset the input dimensionality of the current sub-network to the super-network dimensionality.""" + """Resets the MLP dimensions to the original super-network dimensionality.""" self.sub_network_n_embd = self.in_features self.sub_network_intermediate_size = self.intermediate_size @@ -63,7 +69,13 @@ def __init__(self, config: Config) -> None: def set_sub_network( self, sub_network_n_embd: int, sub_network_intermediate_size: int ): - """Set the input dimensionality of the current sub-network.""" + """ + Sets the dimensionality of the current sub-network MLP layers. + + Args: + sub_network_n_embd: Input and output embedding dimension of the sub-network. + sub_network_intermediate_size: Hidden layer dimension of the sub-network MLP. + """ self.sub_network_n_embd = sub_network_n_embd self.sub_network_intermediate_size = sub_network_intermediate_size diff --git a/whittle/models/gpt/extract.py b/whittle/models/gpt/extract.py index 401484b8..ec5f14a6 100644 --- a/whittle/models/gpt/extract.py +++ b/whittle/models/gpt/extract.py @@ -20,8 +20,7 @@ def extract_sub_network(model: GPT, sub_network_config: Config) -> GPT: Args: model: The original, full GPT model from which the sub-network is extracted. sub_network_config: Configuration object for the sub-network, containing the necessary - architecture specifications such as embedding size, number of heads, - and number of layers. + architecture specifications such as embedding size, number of heads, and number of layers. Returns: A new sub-network model instance, initialized with parameters extracted from the original model.