feat(server): Multiprocess migration and db_cleanup
This patch allows the server to run the schema migration and
`db_cleanup` actions in parallel across the configured products, to
speed up these operations. Previously, the migration and cleanup ran in
a sequential job on all products, forfeiting the benefit of having
multiple CPUs on a server. As each product is a separate database and
there must not be any shared resource between products, it is safe to
run each migration in a separate process, in parallel.
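For illustration only (not the server's actual code), per-product migrations can
be fanned out to a process pool; the `migrate_product` helper and the static
product list below are hypothetical stand-ins:

```python
# A minimal sketch, assuming a hypothetical migrate_product() helper and a
# hard-coded product list; the real server derives the products from the
# configuration database.
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count


def migrate_product(endpoint: str) -> bool:
    # Each worker opens its own connection to its product's database, so no
    # state is shared between the processes.
    print(f"[{endpoint}] running schema migration...")
    return True


if __name__ == "__main__":
    products = ["Default", "Frontend", "Backend"]
    with ProcessPoolExecutor(max_workers=cpu_count()) as pool:
        results = list(pool.map(migrate_product, sorted(products)))
    print(f"{sum(results)}/{len(results)} product(s) migrated successfully.")
```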

Migration and cleanup jobs are now prepared for scheduling in a deterministic
order (`ORDER BY endpoint`); previously the order followed the `ROWID`.
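A rough SQLAlchemy sketch of this scheduling order; the `Product` model and the
in-memory database below are stand-ins, not CodeChecker's actual configuration
schema:

```python
# Illustrative only: a stand-in Product model queried in endpoint order, the
# way the migration/cleanup work queue is now prepared deterministically.
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class Product(Base):
    __tablename__ = "products"
    id = Column(Integer, primary_key=True)
    endpoint = Column(String, unique=True)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with sessionmaker(bind=engine)() as session:
    # ORDER BY endpoint instead of the implicit ROWID order, so the schedule
    # is stable across runs and across database backends.
    scheduled = session.query(Product).order_by(Product.endpoint).all()
```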

The connection to the "config" database is released early to prevent a
timeout on the unused and unchanging configuration database from crashing
the server during a longer-running product migration.
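A sketch of this early-release pattern, assuming a SQLAlchemy session factory
for the configuration database; the names below are illustrative and do not
match the server's real API:

```python
def run_product_migrations(config_session_factory, list_products, migrate_one):
    """Illustrative only; names do not match the server's real API."""
    session = config_session_factory()
    try:
        # Read everything needed from the configuration database up front.
        connection_args = {p.endpoint: p.connection
                           for p in list_products(session)}
    finally:
        # Release the "config" connection before the potentially long-running
        # product migrations start, so an idle connection cannot time out and
        # crash the server mid-way.
        session.close()

    for endpoint, connection in sorted(connection_args.items()):
        migrate_one(endpoint, connection)
```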

Added a facility for well-formatted logging output in migration scripts and
the cleanup routines. This output now always includes the product's
`endpoint` identifier, because log messages from different products are no
longer sequential.
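One way to get such output with the standard `logging` module (a sketch, not
the facility added by this patch):

```python
# A minimal sketch using the standard library; the adapter name and format
# are illustrative and do not reflect CodeChecker's actual logger setup.
import logging

logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)


class EndpointLogAdapter(logging.LoggerAdapter):
    """Prefixes every message with the owning product's endpoint."""

    def process(self, msg, kwargs):
        return f"[{self.extra['endpoint']}] {msg}", kwargs


log = EndpointLogAdapter(logging.getLogger("migration"),
                         {"endpoint": "Default"})
log.info("Upgrading...")  # -> [INFO] [Default] Upgrading...
```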
whisperity committed Feb 28, 2024
1 parent b544c58 commit a69b946
Showing 65 changed files with 1,409 additions and 937 deletions.
6 changes: 2 additions & 4 deletions analyzer/codechecker_analyzer/cmd/analyze.py
@@ -16,8 +16,6 @@
import os
import shutil
import sys

import multiprocess
from typing import List

from tu_collector import tu_collector
@@ -31,6 +29,7 @@
from codechecker_analyzer.buildlog import log_parser

from codechecker_common import arg, logger, cmd_config, review_status_handler
from codechecker_common.compatibility.multiprocessing import cpu_count
from codechecker_common.skiplist_handler import SkipListHandler, \
SkipListHandlers
from codechecker_common.util import load_json
@@ -169,8 +168,7 @@ def add_arguments_to_parser(parser):
type=int,
dest="jobs",
required=False,
# pylint: disable=no-member
default=multiprocess.cpu_count(),
default=cpu_count(),
help="Number of threads to use in analysis. More "
"threads mean faster analysis at the cost of "
"using more memory.")
6 changes: 2 additions & 4 deletions analyzer/codechecker_analyzer/cmd/check.py
@@ -17,14 +17,13 @@
import sys
import tempfile

import multiprocess

from codechecker_analyzer.analyzers import analyzer_types
from codechecker_analyzer.arg import \
OrderedCheckersAction, OrderedConfigAction, \
analyzer_config, checker_config, existing_abspath

from codechecker_common import arg, cmd_config, logger
from codechecker_common.compatibility.multiprocessing import cpu_count
from codechecker_common.source_code_comment_handler import \
REVIEW_STATUS_VALUES

@@ -183,8 +182,7 @@ def add_arguments_to_parser(parser):
type=int,
dest="jobs",
required=False,
# pylint: disable=no-member
default=multiprocess.cpu_count(),
default=cpu_count(),
help="Number of threads to use in analysis. "
"More threads mean faster analysis at "
"the cost of using more memory.")
7 changes: 7 additions & 0 deletions codechecker_common/compatibility/__init__.py
@@ -0,0 +1,7 @@
# -------------------------------------------------------------------------
#
# Part of the CodeChecker project, under the Apache License v2.0 with
# LLVM Exceptions. See LICENSE for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# -------------------------------------------------------------------------
codechecker_common/compatibility/multiprocessing.py
@@ -6,13 +6,14 @@
#
# -------------------------------------------------------------------------
"""
Multiprocess compatibility module.
Multiprocessing compatibility module.
"""

import sys

# pylint: disable=unused-import
# pylint: disable=no-name-in-module,unused-import
if sys.platform in ["darwin", "win32"]:
from multiprocess import Pool as MultiProcessPool
from multiprocess import Pool # type: ignore
from multiprocess import cpu_count
else:
from concurrent.futures import ProcessPoolExecutor as MultiProcessPool
from concurrent.futures import ProcessPoolExecutor as Pool # type: ignore
from multiprocessing import cpu_count
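
Callers can now import a single name regardless of platform; for example
(assuming `codechecker_common` is on the import path):

```python
# On macOS and Windows this resolves to multiprocess.Pool, elsewhere to
# concurrent.futures.ProcessPoolExecutor; both accept a worker count as the
# first argument, work as context managers and provide a map() method.
from codechecker_common.compatibility.multiprocessing import Pool, cpu_count


def square(value):
    return value * value


if __name__ == "__main__":
    with Pool(cpu_count()) as executor:
        print(list(executor.map(square, range(8))))
```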
14 changes: 11 additions & 3 deletions codechecker_common/util.py
@@ -8,12 +8,11 @@
"""
Util module.
"""


import itertools
import json
from typing import TextIO
import os
from typing import TextIO

import portalocker

from codechecker_common.logger import get_logger
@@ -34,6 +33,15 @@ def arg_match(options, args):
return matched_args


def clamp(min_: int, value: int, max_: int) -> int:
"""
Clamps ``value`` to be between ``min_`` and ``max_``, inclusive.
"""
if min_ > max_:
raise ValueError("min <= max required")
return min(max(min_, value), max_)


def chunks(iterator, n):
"""
Yield the next chunk if an iterable object. A chunk consists of maximum n
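A quick usage note for the new `clamp` helper above (its argument order is
`min_, value, max_`):

```python
# Assuming codechecker_common is importable; values outside the range are
# pinned to the nearest bound, and min_ > max_ raises ValueError.
from codechecker_common.util import clamp

assert clamp(1, 5, 10) == 5    # already in range, returned unchanged
assert clamp(1, -3, 10) == 1   # below the minimum, raised to min_
assert clamp(1, 42, 10) == 10  # above the maximum, lowered to max_
```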
53 changes: 29 additions & 24 deletions docs/web/db_schema_guide.md
@@ -17,8 +17,8 @@ https://alembic.sqlalchemy.org/en/latest/autogenerate.html#what-does-autogenerat

# Updating configuration database schema

Config database schema scripts can be found under the `config_db_migrate`
directory.
Config database schema scripts can be found under the
`server/codechecker_server/migrations/config/versions` directory.

## Automatic migration script generation (Online)

@@ -30,20 +30,21 @@ version.
The configuration database schema file can be found here:
`server/codechecker_server/database/config_db_model.py`

### **Step 2**: Check the alembic.ini configuration settings
### **Step 2**: Check the `alembic.ini` configuration settings

Database connection should point to the correct database.
Edit the sqlalchemy.url option in [alembic.ini](
https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file)
according to your database configuration.
Edit the `sqlalchemy.url` option in
[alembic.ini](https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file)
according to your database configuration.

### **Step 3**: Use alembic to autogenerate migration scripts

`alembic --name config_db revision --autogenerate -m "Change description"`

### **Step 4**: Check the generated scripts

The new migration script
`config_db_migrate/versions/{hash}_change_description.py` is generated.
`migrations/config/versions/{hash}_change_description.py` is generated.
**You must always check the generated script because sometimes it isn't
correct.**
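
For reference, an autogenerated revision typically follows the skeleton below;
the revision identifiers and the column change are purely illustrative:

```python
"""Change description

Revision ID: 3e91fdf3f2a1
Revises: 00099e8bc212
Create Date: 2024-02-28 15:01:00.000000
"""
import sqlalchemy as sa
from alembic import op

# Revision identifiers used by Alembic (illustrative values).
revision = '3e91fdf3f2a1'
down_revision = '00099e8bc212'
branch_labels = None
depends_on = None


def upgrade():
    op.add_column('products',
                  sa.Column('description', sa.String(), nullable=True))


def downgrade():
    op.drop_column('products', 'description')
```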

@@ -60,29 +61,34 @@ Don't forget to commit the migration script with your other changes.

## Automatic migration script generation (Online)

A Codechecker server should be started and a product should be configured with
A CodeChecker server should be started and a product should be configured with
a previous database schema version.

Product (run) database schema scripts can be found under the
`server/codechecker_server/migrations/report/versions` directory.

### **Step 1**: Update the database model

The run database schema file can be found here:
`server/codechecker_server/database/run_db_model.py`

### **Step 2**: Check alembic.ini configuration
### **Step 2**: Check `alembic.ini` configuration

Database connection should point to the correct database.
Edit the sqlalchemy.url option in [alembic.ini](
https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file)
according to your database configuration.
Edit the `sqlalchemy.url` option in
[alembic.ini](https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file)
according to your database configuration.

#### **Step 2**: Generating migration scripts using autogenerate

`alembic --name run_db revision --autogenerate -m "Change description"`

#### **Step 3**: Check the generated scripts
The new migration script db_migrate/versions/{hash}_change_description.py is
generated. **You must always check the generated script because sometimes it
isn't correct.**

The new migration script
`migrations/report/versions/{hash}_change_description.py` is generated.
**You must always check the generated script because sometimes it isn't
correct.**

#### **Step 4**: Run all test cases.

@@ -118,7 +124,7 @@ and the other is the run database (storing analysis reports).
If there is some schema mismatch and migration is needed you will get a
warning at server start.

## IMPORTANT before schema upgrade
## IMPORTANT: before schema upgrade

If there is some schema change it is recommended to create a full backup
of your configuration and run databases before running the migration.
@@ -187,17 +193,16 @@ command.
$ CodeChecker server --db-upgrade-schema Default
[15:01] - Checking configuration database ...
[15:01] - Database is OK.
[15:01] - Preparing schema upgrade for Default
[15:01] - Preparing schema upgrade for 'Default'
[WARNING] [15:01] - Please note after migration only newer CodeChecker versions can be used to start the server
[WARNING] [15:01] - It is advised to make a full backup of your run databases.
[15:01] - Checking: Default
[15:01] - [Default] Database schema mismatch: migration is available.
Do you want to upgrade 'Default' to new schema? Y(es)/n(o) y
[15:01] - [Default] Schema will be upgraded...
[15:01] - ========================
[15:01] - Upgrading: Default
[15:01] - Database schema mismatch: migration is available.
Do you want to upgrade to new schema? Y(es)/n(o) y
Upgrading schema ...
Done.
Database is OK.
[15:01] - ========================
[15:02] - [Default] Upgrading...
[15:03] - [Default] Done upgrading.
```

Schema upgrade can be done for multiple products in a row if the
4 changes: 2 additions & 2 deletions web/client/codechecker_client/blame_info.py
@@ -6,8 +6,8 @@
from git.exc import InvalidGitRepositoryError
from typing import Dict, Iterable, Optional

from codechecker_common.compatibility.multiprocessing import Pool
from codechecker_common.logger import get_logger
from codechecker_common.multiprocesspool import MultiProcessPool

LOG = get_logger('system')

Expand Down Expand Up @@ -112,7 +112,7 @@ def assemble_blame_info(
Returns the number of collected blame information.
"""
with MultiProcessPool() as executor:
with Pool() as executor:
file_blame_info = __collect_blame_info_for_files(
file_paths, executor.map)

23 changes: 14 additions & 9 deletions web/client/codechecker_client/cmd/store.py
@@ -42,18 +42,24 @@
from codechecker_client import product
from codechecker_common import arg, logger, cmd_config
from codechecker_common.checker_labels import CheckerLabels
from codechecker_common.compatibility.multiprocessing import Pool
from codechecker_common.source_code_comment_handler import \
SourceCodeCommentHandler
from codechecker_common.util import load_json
from codechecker_common.multiprocesspool import MultiProcessPool

from codechecker_web.shared import webserver_context, host_check
from codechecker_web.shared.env import get_default_workspace

try:
from codechecker_client.blame_info import assemble_blame_info
except ImportError:
pass
def assemble_blame_info(_, __) -> int:
"""
Shim for cases where Git blame info is not gatherable due to
missing libraries.
"""
raise NotImplementedError()


LOG = logger.get_logger('system')

@@ -371,7 +377,7 @@ def filter_source_files_with_comments(
"""
jobs = file_report_positions.items()

with MultiProcessPool() as executor:
with Pool() as executor:
return get_source_file_with_comments(jobs, executor.map)


@@ -447,7 +453,7 @@ def assemble_zip(inputs,

LOG.debug("Processing report files ...")

with MultiProcessPool() as executor:
with Pool() as executor:
analyzer_result_file_reports = parse_analyzer_result_files(
analyzer_result_file_paths, checker_labels, executor.map)

@@ -562,14 +568,13 @@ def assemble_zip(inputs,
zipf, file_paths)

if stats.num_of_blame_information:
LOG.info("Collecting blame information done.")
LOG.info("Collecting blame information... Done.")
else:
LOG.info("No blame information found for source files.")
except NameError:
except NotImplementedError:
LOG.warning(
"Collecting blame information has been failed. Make sure "
"'git' is available on your system to hide this warning "
"message.")
"Failed to collect blame information. Make sure Git is "
"installed on your system.")

zipf.writestr('content_hashes.json', json.dumps(file_to_hash))

8 changes: 5 additions & 3 deletions web/server/codechecker_server/api/product_server.py
@@ -376,7 +376,8 @@ def addProduct(self, product):
msg)

conn_str = SQLServer \
.from_cmdline_args(conn_str_args, IDENTIFIER, None, False, None) \
.from_cmdline_args(conn_str_args, product.endpoint, IDENTIFIER,
None, False, None) \
.get_connection_string()

is_rws_change_disabled = product.isReviewStatusChangeDisabled
@@ -534,8 +535,9 @@ def editProduct(self, product_id, new_config):
msg)

conn_str = SQLServer \
.from_cmdline_args(conn_str_args, IDENTIFIER, None,
False, None).get_connection_string()
.from_cmdline_args(conn_str_args, product.endpoint,
IDENTIFIER, None, False, None) \
.get_connection_string()

# If endpoint or database arguments change, the product
# configuration has changed so severely, that it needs
8 changes: 4 additions & 4 deletions web/server/codechecker_server/api/report_server.py
@@ -3400,8 +3400,8 @@ def removeRunReports(self, run_ids, report_filter, cmp_data):
# access timestamp to file entries to delay their removal (and avoid
# removing frequently accessed files). The same comment applies to
# removeRun() function.
db_cleanup.remove_unused_comments(self._Session)
db_cleanup.remove_unused_analysis_info(self._Session)
db_cleanup.remove_unused_comments(self._product)
db_cleanup.remove_unused_analysis_info(self._product)

return True

@@ -3445,8 +3445,8 @@ def removeRun(self, run_id, run_filter):
# error. An alternative solution can be adding a timestamp to file
# entries to delay their removal. The same comment applies to
# removeRunReports() function.
db_cleanup.remove_unused_comments(self._Session)
db_cleanup.remove_unused_analysis_info(self._Session)
db_cleanup.remove_unused_comments(self._product)
db_cleanup.remove_unused_analysis_info(self._product)

return bool(runs)
