Restructure how-to guides #2232

Merged · 1 commit · Aug 21, 2023
25 changes: 19 additions & 6 deletions doc/source/conf.py
@@ -73,21 +73,34 @@
# To prevent 404 errors and redirect to the new pages.
redirects = {
    # Renamed pages
-   "installation": "install-flower.html",
-   "configuring-clients.html": "configure-clients.html",
+   "installation": "how-to-install-flower.html",
+   "configuring-clients.html": "how-to-configure-clients.html",
    "quickstart_mxnet": "quickstart-mxnet.html",
    "quickstart_pytorch_lightning": "quickstart-pytorch-lightning.html",
-   "example_walkthrough_pytorch_mnist": "example-walkthrough-pytorch-mnist.html",
    "quickstart_huggingface": "quickstart-huggingface.html",
    "quickstart_pytorch": "quickstart-pytorch.html",
    "quickstart_tensorflow": "quickstart-tensorflow.html",
-   "release_process": "release-process.html",
-   "saving-progress": "save-progress.html",
-   "writing-documentation": "write-documentation.html",
    "quickstart_scikitlearn": "quickstart-scikitlearn.html",
    "quickstart_xgboost": "quickstart-xgboost.html",
+   "example_walkthrough_pytorch_mnist": "example-walkthrough-pytorch-mnist.html",
+   "release_process": "release-process.html",
+   "saving-progress": "how-to-save-and-load-model-checkpoints.html",
+   "writing-documentation": "write-documentation.html",
    "apiref-binaries": "apiref-cli.html",
    "fedbn-example-pytorch-from-centralized-to-federated": "example-fedbn-pytorch-from-centralized-to-federated.html",
+
+   # Restructuring: how-to guides
+   "install-flower": "how-to-install-flower.html",
+   "configure-clients": "how-to-configure-clients.html",
+   "strategies": "how-to-use-strategies.html",
+   "implementing-strategies": "how-to-implement-strategies.html",
+   "save-progress": "how-to-save-and-load-model-checkpoints.html",
+   "saving-and-loading-pytorch-checkpoints": "how-to-save-and-load-model-checkpoints.html",
+   "monitor-simulation": "how-to-monitor-simulation.html",
+   "logging": "how-to-configure-logging.html",
+   "ssl-enabled-connections": "how-to-enable-ssl-connections.html",
+   "upgrade-to-flower-1.0": "how-to-upgrade-to-flower-1.0.html",
+
    # Deleted pages
    "people": "index.html",
    "organizations": "index.html",
@@ -1,47 +1,8 @@
-Save Progress
-=============
+Aggregate evaluation results
+============================

-The Flower server does not prescribe a way to persist model updates or evaluation results.
-Flower does not (yet) automatically save model updates on the server-side.
-It's on the roadmap to provide a built-in way of doing this.
+The Flower server does not prescribe a way to aggregate evaluation results, but it enables the user to fully customize result aggregation.

-Model Checkpointing
--------------------
-
-Model updates can be persisted on the server-side by customizing :code:`Strategy` methods.
-Implementing custom strategies is always an option, but for many cases it may be more convenient to simply customize an existing strategy.
-The following code example defines a new :code:`SaveModelStrategy` which customizes the existing built-in :code:`FedAvg` strategy.
-In particular, it customizes :code:`aggregate_fit` by calling :code:`aggregate_fit` in the base class (:code:`FedAvg`).
-It then continues to save returned (aggregated) weights before it returns those aggregated weights to the caller (i.e., the server):
-
-.. code-block:: python
-
-    class SaveModelStrategy(fl.server.strategy.FedAvg):
-        def aggregate_fit(
-            self,
-            server_round: int,
-            results: List[Tuple[fl.server.client_proxy.ClientProxy, fl.common.FitRes]],
-            failures: List[Union[Tuple[ClientProxy, FitRes], BaseException]],
-        ) -> Tuple[Optional[Parameters], Dict[str, Scalar]]:
-
-            # Call aggregate_fit from base class (FedAvg) to aggregate parameters and metrics
-            aggregated_parameters, aggregated_metrics = super().aggregate_fit(server_round, results, failures)
-
-            if aggregated_parameters is not None:
-                # Convert `Parameters` to `List[np.ndarray]`
-                aggregated_ndarrays: List[np.ndarray] = fl.common.parameters_to_ndarrays(aggregated_parameters)
-
-                # Save aggregated_ndarrays
-                print(f"Saving round {server_round} aggregated_ndarrays...")
-                np.savez(f"round-{server_round}-weights.npz", *aggregated_ndarrays)
-
-            return aggregated_parameters, aggregated_metrics
-
-    # Create strategy and run server
-    strategy = SaveModelStrategy(
-        # (same arguments as FedAvg here)
-    )
-    fl.server.start_server(strategy=strategy)

Aggregate Custom Evaluation Results
-----------------------------------
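
As a sketch of what such customization can look like (a minimal example assuming a :code:`FedAvg` base strategy and clients that return an :code:`accuracy` value in their evaluation metrics; both are assumptions, not requirements), one can override :code:`aggregate_evaluate`:

.. code-block:: python

    class AggregateCustomMetricStrategy(fl.server.strategy.FedAvg):
        def aggregate_evaluate(self, server_round, results, failures):
            """Aggregate losses via FedAvg and compute a weighted accuracy."""
            if not results:
                return None, {}

            # Call aggregate_evaluate from base class (FedAvg) to aggregate losses
            aggregated_loss, aggregated_metrics = super().aggregate_evaluate(server_round, results, failures)

            # Weigh each client's reported accuracy by its number of examples
            accuracies = [res.metrics["accuracy"] * res.num_examples for _, res in results]
            examples = [res.num_examples for _, res in results]
            aggregated_accuracy = sum(accuracies) / sum(examples)
            print(f"Round {server_round} accuracy aggregated from client results: {aggregated_accuracy}")

            return aggregated_loss, {"accuracy": aggregated_accuracy}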
@@ -1,4 +1,4 @@
-Configure Clients
+Configure clients
=================

Along with model parameters, Flower can send configuration values to clients. Configuration values can be used for various purposes. They are, for example, a popular way to control client-side hyperparameters from the server.
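
As a minimal sketch of the mechanism (assuming the built-in :code:`FedAvg` strategy; the hyperparameter names are illustrative), a configuration function can be passed to the strategy via :code:`on_fit_config_fn`:

.. code-block:: python

    import flwr as fl


    def fit_config(server_round: int):
        """Return a configuration dict that is sent to each client before training."""
        return {
            "learning_rate": 0.01,  # illustrative client-side hyperparameter
            "local_epochs": 2 if server_round < 3 else 5,
        }


    strategy = fl.server.strategy.FedAvg(on_fit_config_fn=fit_config)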
@@ -1,5 +1,5 @@
-Logging
-=============
+Configure logging
+=================

The Flower logger keeps track of all core events that take place in federated learning workloads.
By default, it presents information following a standard message format:
@@ -30,7 +30,7 @@ In this way, the logger would typically display information on your terminal as
...


-Saving Log to File
+Saving log to file
-------------------

By default, the Flower log is output to the terminal from which you launch your federated learning workload. This applies both to gRPC-based federation (i.e., when you run :code:`fl.server.start_server`) and to the :code:`VirtualClientEngine` (i.e., when you run :code:`fl.simulation.start_simulation`).
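
A minimal sketch of persisting the log to a file (the identifier and file name are illustrative):

.. code-block:: python

    import flwr as fl

    # Record the log to `log.txt` in addition to printing it to the terminal
    fl.common.logger.configure(identifier="myFlowerExperiment", filename="log.txt")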
@@ -70,8 +70,8 @@ If we inspect we see the log above is also recorded but prefixing with :code:`id`
...


-Logging Your Own Messages
--------------------------
+Log your own messages
+---------------------

You may want to expand the information shown by default with messages relevant to your application.
You can achieve this easily as follows.
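
A minimal sketch (the message text is illustrative):

.. code-block:: python

    from logging import INFO

    from flwr.common.logger import log

    # Emit a custom message through the Flower logger
    log(INFO, "Starting custom workload...")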
@@ -124,8 +124,8 @@ In this way your logger will show, in addition to the default messages, the ones
...


-Logging to a Remote Service
----------------------------
+Log to a remote service
+-----------------------

The :code:`fl.common.logger.configure` function also allows specifying a host to which logs can be pushed (via :code:`POST`) through a native Python :code:`logging.handlers.HTTPHandler`.
This is a particularly useful feature in :code:`gRPC`-based federated learning workloads, where gathering logs from all entities (i.e., the server and the clients) might otherwise be cumbersome.
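
A minimal sketch (the host address is a placeholder):

.. code-block:: python

    import flwr as fl

    # Push log records to a remote HTTP endpoint via `logging.handlers.HTTPHandler`
    fl.common.logger.configure(identifier="myFlowerExperiment", host="http://<your-host>:<port>")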
@@ -1,5 +1,5 @@
-SSL-enabled Server and Client
-=============================
+Enable SSL connections
+======================

This guide describes how an SSL-enabled secure Flower server can be started and
how a Flower client can establish a secure connection to it.
@@ -92,7 +92,7 @@ You should now have learned how to generate self-signed certificates using the g
SSL-enabled server, and have a client establish a secure connection to it.


-Additional Resources
+Additional resources
--------------------

These additional sources might be relevant if you would like to dive deeper into the topic of certificates:
@@ -1,5 +1,5 @@
-Implementing Strategies
-=======================
+Implement strategies
+====================

The strategy abstraction enables the implementation of fully custom strategies. A
strategy is, in essence, the federated learning algorithm that runs on the server.
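
As a sketch of the shape such a class takes (the method names follow the :code:`Strategy` abstract base class; the bodies are left out here):

.. code-block:: python

    import flwr as fl


    class MyStrategy(fl.server.strategy.Strategy):
        """A hypothetical skeleton; every abstract method must be implemented."""

        def initialize_parameters(self, client_manager):
            """Return the initial global model parameters (or None)."""

        def configure_fit(self, server_round, parameters, client_manager):
            """Select clients and build the training instructions for this round."""

        def aggregate_fit(self, server_round, results, failures):
            """Combine the clients' model updates into new global parameters."""

        def configure_evaluate(self, server_round, parameters, client_manager):
            """Select clients and build the evaluation instructions for this round."""

        def aggregate_evaluate(self, server_round, results, failures):
            """Combine the clients' evaluation results."""

        def evaluate(self, server_round, parameters):
            """Evaluate the current global parameters on the server side (optional)."""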
@@ -2,10 +2,10 @@ Install Flower
==============


-Python Version
+Python version
--------------

-Flower requires `Python 3.7 <https://docs.python.org/3.7/>`_ or above.
+Flower requires at least `Python 3.7 <https://docs.python.org/3.7/>`_, but `Python 3.8 <https://docs.python.org/3.8/>`_ or above is recommended.


Install stable release
@@ -1,12 +1,14 @@
-Monitor Simulation
+Monitor simulation
==================

Flower allows you to monitor system resources while running your simulation. Moreover, the Flower simulation engine is powerful and enables you to decide how to allocate resources on a per-client basis and constrain the total usage. Insights from resource consumption can help you make smarter decisions and speed up the execution time.

The specific instructions assume you are using macOS and have the `Homebrew <https://brew.sh/>`_ package manager installed.


Downloads
---------

.. code-block:: bash

    brew install prometheus grafana
@@ -81,6 +83,7 @@ Your terminal editor should open and allow you to apply the following configurat

Congratulations, you just downloaded all the software needed for metrics tracking. Now, let’s start it.


Tracking metrics
----------------

@@ -127,6 +130,7 @@ After you finish the visualization, stop Prometheus and Grafana. This is importa

Resource allocation
-------------------

You must understand how the Ray library works to efficiently allocate system resources to simulation clients on your own.

Initially, the simulation (which Ray handles under the hood) starts by default with all the available resources on the system, which it shares among the clients. This does not mean it divides the resources equally among all of them, nor that model training happens on all of them simultaneously. You will learn more about that in a later part of this guide. You can check the system resources by running the following:
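
A minimal sketch of such a check (assuming Ray is installed; :code:`ray.init()` is only needed if the simulation has not already initialized Ray):

.. code-block:: python

    import ray

    ray.init()
    print(ray.available_resources())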
@@ -203,8 +207,10 @@ Now comes the crucial part. Ray will start a new client only when it has all the
In the example above, only one client will be run, so your clients won't run concurrently. Setting :code:`client_num_gpus = 0.5` would allow running two clients and therefore enable them to run concurrently.
Be careful not to require more resources than available. If you specified :code:`client_num_gpus = 2`, the simulation wouldn't start (even if you had 2 GPUs but decided to set 1 in :code:`ray_init_args`).
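
A minimal sketch of constraining per-client resources when launching a simulation (:code:`client_fn` is assumed to be defined elsewhere; the numbers are illustrative):

.. code-block:: python

    import flwr as fl

    # With one GPU visible to Ray, `num_gpus=0.5` lets two clients run concurrently
    fl.simulation.start_simulation(
        client_fn=client_fn,
        num_clients=10,
        client_resources={"num_cpus": 2, "num_gpus": 0.5},
        config=fl.server.ServerConfig(num_rounds=3),
    )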


FAQ
---

Q: I don't see any metrics logged.

A: The timeframe might not be properly set. The setting is in the top right corner ("Last 30 minutes" by default). Please change the timeframe to reflect the period when the simulation was running.
@@ -221,8 +227,10 @@ Q: I see "This site can't be reached" when going to `<http://127.0.0.1:8265>`_.

A: Either the simulation has already finished, or you still need to start Prometheus.


Resources
---------

Ray Dashboard: `<https://docs.ray.io/en/latest/ray-core/ray-dashboard.html>`_

Ray Metrics: `<https://docs.ray.io/en/latest/ray-observability/ray-metrics.html>`_
93 changes: 93 additions & 0 deletions doc/source/how-to-save-and-load-model-checkpoints.rst
@@ -0,0 +1,93 @@
Save and load model checkpoints
===============================

Flower does not automatically save model updates on the server-side. This how-to guide describes the steps to save (and load) model checkpoints in Flower.


Model checkpointing
-------------------

Model updates can be persisted on the server-side by customizing :code:`Strategy` methods.
Implementing custom strategies is always an option, but for many cases it may be more convenient to simply customize an existing strategy.
The following code example defines a new :code:`SaveModelStrategy` which customizes the existing built-in :code:`FedAvg` strategy.
In particular, it customizes :code:`aggregate_fit` by calling :code:`aggregate_fit` in the base class (:code:`FedAvg`).
It then saves the returned (aggregated) weights before handing them back to the caller (i.e., the server):

.. code-block:: python

    from typing import Dict, List, Optional, Tuple, Union

    import flwr as fl
    import numpy as np
    from flwr.common import FitRes, Parameters, Scalar
    from flwr.server.client_proxy import ClientProxy


    class SaveModelStrategy(fl.server.strategy.FedAvg):
        def aggregate_fit(
            self,
            server_round: int,
            results: List[Tuple[fl.server.client_proxy.ClientProxy, fl.common.FitRes]],
            failures: List[Union[Tuple[ClientProxy, FitRes], BaseException]],
        ) -> Tuple[Optional[Parameters], Dict[str, Scalar]]:
            # Call aggregate_fit from base class (FedAvg) to aggregate parameters and metrics
            aggregated_parameters, aggregated_metrics = super().aggregate_fit(server_round, results, failures)

            if aggregated_parameters is not None:
                # Convert `Parameters` to `List[np.ndarray]`
                aggregated_ndarrays: List[np.ndarray] = fl.common.parameters_to_ndarrays(aggregated_parameters)

                # Save aggregated_ndarrays
                print(f"Saving round {server_round} aggregated_ndarrays...")
                np.savez(f"round-{server_round}-weights.npz", *aggregated_ndarrays)

            return aggregated_parameters, aggregated_metrics


    # Create strategy and run server
    strategy = SaveModelStrategy(
        # (same arguments as FedAvg here)
    )
    fl.server.start_server(strategy=strategy)


Save and load PyTorch checkpoints
---------------------------------

Similar to the previous example, but with a few extra steps, we'll show how to
store a PyTorch checkpoint using the ``torch.save`` function.
First, ``aggregate_fit`` returns a ``Parameters`` object that has to be transformed into a list of NumPy ``ndarray``'s;
those are then transformed into a PyTorch ``state_dict`` following the ``OrderedDict`` class structure.

.. code-block:: python

    from collections import OrderedDict
    from typing import Dict, List, Optional, Tuple, Union

    import flwr as fl
    import numpy as np
    import torch
    from flwr.common import FitRes, Parameters, Scalar
    from flwr.server.client_proxy import ClientProxy

    import cifar  # assumed: project module defining the model class `Net`

    DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # assumed device setup
    net = cifar.Net().to(DEVICE)


    class SaveModelStrategy(fl.server.strategy.FedAvg):
        def aggregate_fit(
            self,
            server_round: int,
            results: List[Tuple[fl.server.client_proxy.ClientProxy, fl.common.FitRes]],
            failures: List[Union[Tuple[ClientProxy, FitRes], BaseException]],
        ) -> Tuple[Optional[Parameters], Dict[str, Scalar]]:
            """Aggregate model weights using weighted average and store checkpoint"""

            # Call aggregate_fit from base class (FedAvg) to aggregate parameters and metrics
            aggregated_parameters, aggregated_metrics = super().aggregate_fit(server_round, results, failures)

            if aggregated_parameters is not None:
                print(f"Saving round {server_round} aggregated_parameters...")

                # Convert `Parameters` to `List[np.ndarray]`
                aggregated_ndarrays: List[np.ndarray] = fl.common.parameters_to_ndarrays(aggregated_parameters)

                # Convert `List[np.ndarray]` to PyTorch `state_dict`
                params_dict = zip(net.state_dict().keys(), aggregated_ndarrays)
                state_dict = OrderedDict({k: torch.tensor(v) for k, v in params_dict})
                net.load_state_dict(state_dict, strict=True)

                # Save the model
                torch.save(net.state_dict(), f"model_round_{server_round}.pth")

            return aggregated_parameters, aggregated_metrics

To load your progress, you simply append the following lines to your code. Note that this will iterate over all saved checkpoints and load the latest one:

.. code-block:: python

    import glob
    import os

    import torch

    list_of_files = [fname for fname in glob.glob("./model_round_*")]
    latest_round_file = max(list_of_files, key=os.path.getctime)
    print("Loading pre-trained model from: ", latest_round_file)
    state_dict = torch.load(latest_round_file)
    net.load_state_dict(state_dict)
@@ -1,5 +1,5 @@
-Strategies
-==========
+Use strategies
+==============

Flower allows full customization of the learning process through the :code:`Strategy` abstraction. A number of built-in strategies are provided in the core framework.
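
A minimal sketch of using a built-in strategy (the parameter values are illustrative):

.. code-block:: python

    import flwr as fl

    # Configure the built-in FedAvg strategy and hand it to the server
    strategy = fl.server.strategy.FedAvg(
        fraction_fit=0.5,          # sample 50% of the available clients for training
        min_available_clients=10,  # wait until at least 10 clients are connected
    )
    fl.server.start_server(strategy=strategy)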

22 changes: 11 additions & 11 deletions doc/source/index.rst
@@ -79,17 +79,16 @@ Problem-oriented how-to guides show step-by-step how to achieve a specific goal.
   :maxdepth: 1
   :caption: How-to guides

-   install-flower
-   configure-clients
-   strategies
-   implementing-strategies
-   save-progress
-   logging
-   saving-and-loading-pytorch-checkpoints
-   monitor-simulation
-   ssl-enabled-connections
-   recommended-env-setup
-   upgrade-to-flower-1.0
+   how-to-install-flower
+   how-to-configure-clients
+   how-to-use-strategies
+   how-to-implement-strategies
+   how-to-aggregate-evaluation-results
+   how-to-save-and-load-model-checkpoints
+   how-to-monitor-simulation
+   how-to-configure-logging
+   how-to-enable-ssl-connections
+   how-to-upgrade-to-flower-1.0

.. toctree::
   :maxdepth: 1
@@ -148,6 +147,7 @@ intended to help along the way.
   first-time-contributors
   getting-started-for-contributors
   good-first-contributions
+   recommended-env-setup
   contributor-setup
   write-documentation
   architecture