feat(server): Multiprocess migration and db_cleanup
This patch allows the server to run the schema migration and
`db_cleanup` actions in parallel across the configured products,
speeding up these operations. Previously, migration and cleanup ran as
a sequential job over all products, forfeiting the benefit of having
multiple CPUs on the server. As each product is a separate database and
there must be no shared resources between products, it is safe to run
each migration in a separate process, in parallel.

Migrations and cleanups are now scheduled in a deterministic order,
`ORDER BY endpoint`. (Previously, the order was based on the `ROWID`.)

The connection to the "config" database is released early, so that a
timeout on the unused and unchanging configuration database cannot
crash the server during a longer-running product migration.

Added a facility for well-formatted logging output in migration
scripts and the cleanup routines. This log output now always includes
the product's `endpoint` identifier, because the log messages are no
longer emitted sequentially per product.
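
A rough sketch of the approach described above (the function names are hypothetical placeholders, not the server's actual code; only the compatibility `Pool` and `cpu_count` import corresponds to code introduced by this patch):

```python
# Hedged sketch of the per-product parallelisation, not the real server code.
# migrate_product() stands in for the actual Alembic upgrade / db_cleanup run.
from codechecker_common.compatibility.multiprocessing import Pool, cpu_count


def migrate_product(endpoint: str) -> bool:
    """Run the schema migration for a single product's database.

    Each product lives in its own database and shares no resources with the
    others, so it is safe to execute this in a separate worker process.
    """
    print(f"[{endpoint}] Upgrading...")
    # ... connect to the product's database and apply the migration here ...
    print(f"[{endpoint}] Done upgrading.")
    return True


def migrate_all_products(endpoints):
    # Deterministic scheduling order, mirroring the ORDER BY endpoint change.
    with Pool() as pool:
        results = list(pool.map(migrate_product, sorted(endpoints)))
    print(f"{sum(results)}/{len(results)} products migrated, "
          f"up to {cpu_count()} in parallel.")
```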
whisperity committed Mar 25, 2024
1 parent 51bffbf commit 1a0be78
Showing 65 changed files with 1,460 additions and 987 deletions.
6 changes: 2 additions & 4 deletions analyzer/codechecker_analyzer/cmd/analyze.py
@@ -16,8 +16,6 @@
import os
import shutil
import sys

import multiprocess
from typing import List

from tu_collector import tu_collector
@@ -31,6 +29,7 @@
from codechecker_analyzer.buildlog import log_parser

from codechecker_common import arg, logger, cmd_config, review_status_handler
from codechecker_common.compatibility.multiprocessing import cpu_count
from codechecker_common.skiplist_handler import SkipListHandler, \
SkipListHandlers
from codechecker_common.util import load_json
@@ -169,8 +168,7 @@ def add_arguments_to_parser(parser):
type=int,
dest="jobs",
required=False,
# pylint: disable=no-member
default=multiprocess.cpu_count(),
default=cpu_count(),
help="Number of threads to use in analysis. More "
"threads mean faster analysis at the cost of "
"using more memory.")
6 changes: 2 additions & 4 deletions analyzer/codechecker_analyzer/cmd/check.py
@@ -17,14 +17,13 @@
import sys
import tempfile

import multiprocess

from codechecker_analyzer.analyzers import analyzer_types
from codechecker_analyzer.arg import \
OrderedCheckersAction, OrderedConfigAction, \
analyzer_config, checker_config, existing_abspath

from codechecker_common import arg, cmd_config, logger
from codechecker_common.compatibility.multiprocessing import cpu_count
from codechecker_common.source_code_comment_handler import \
REVIEW_STATUS_VALUES

@@ -183,8 +182,7 @@ def add_arguments_to_parser(parser):
type=int,
dest="jobs",
required=False,
# pylint: disable=no-member
default=multiprocess.cpu_count(),
default=cpu_count(),
help="Number of threads to use in analysis. "
"More threads mean faster analysis at "
"the cost of using more memory.")
7 changes: 7 additions & 0 deletions codechecker_common/compatibility/__init__.py
@@ -0,0 +1,7 @@
# -------------------------------------------------------------------------
#
# Part of the CodeChecker project, under the Apache License v2.0 with
# LLVM Exceptions. See LICENSE for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# -------------------------------------------------------------------------
codechecker_common/multiprocesspool.py → codechecker_common/compatibility/multiprocessing.py
@@ -6,13 +6,14 @@
#
# -------------------------------------------------------------------------
"""
Multiprocess compatibility module.
Multiprocessing compatibility module.
"""

import sys

# pylint: disable=unused-import
# pylint: disable=no-name-in-module,unused-import
if sys.platform in ["darwin", "win32"]:
from multiprocess import Pool as MultiProcessPool
from multiprocess import Pool # type: ignore
from multiprocess import cpu_count
else:
from concurrent.futures import ProcessPoolExecutor as MultiProcessPool
from concurrent.futures import ProcessPoolExecutor as Pool # type: ignore
from multiprocessing import cpu_count
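
A minimal usage sketch of the unified interface this compatibility module provides (the worker function is a toy example):

```python
# Toy example: both backends behind the alias expose a context-manager pool
# with a .map() method plus a cpu_count() helper, so callers need not care
# which implementation is active on the current platform.
from codechecker_common.compatibility.multiprocessing import Pool, cpu_count


def _square(value: int) -> int:
    """Trivial, picklable worker function."""
    return value * value


if __name__ == "__main__":
    print(f"{cpu_count()} CPUs available.")
    with Pool() as executor:
        # ProcessPoolExecutor.map() yields an iterator, multiprocess.Pool.map()
        # returns a list; wrapping in list() behaves the same for both.
        print(list(executor.map(_square, range(8))))
```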
14 changes: 11 additions & 3 deletions codechecker_common/util.py
@@ -8,12 +8,11 @@
"""
Util module.
"""


import itertools
import json
from typing import TextIO
import os
from typing import TextIO

import portalocker

from codechecker_common.logger import get_logger
@@ -34,6 +33,15 @@ def arg_match(options, args):
return matched_args


def clamp(min_: int, value: int, max_: int) -> int:
"""
Clamps ``value`` to be between ``min_`` and ``max_``, inclusive.
"""
if min_ > max_:
raise ValueError("min <= max required")
return min(max(min_, value), max_)


def chunks(iterator, n):
"""
Yield the next chunk if an iterable object. A chunk consists of maximum n
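
A quick usage sketch for the new `clamp()` helper (the example of bounding a worker count is illustrative; the patch's actual call sites are not shown in this excerpt):

```python
from codechecker_common.util import clamp

# Bound a requested worker count to a sane range, e.g. when sizing a
# process pool.
print(clamp(1, 64, 16))   # -> 16: values above the maximum are capped
print(clamp(1, 0, 16))    # -> 1:  values below the minimum are raised
print(clamp(1, 8, 16))    # -> 8:  in-range values pass through unchanged

# Inconsistent bounds are rejected.
try:
    clamp(10, 5, 1)
except ValueError as err:
    print(err)            # "min <= max required"
```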
53 changes: 29 additions & 24 deletions docs/web/db_schema_guide.md
@@ -17,8 +17,8 @@ https://alembic.sqlalchemy.org/en/latest/autogenerate.html#what-does-autogenerat

# Updating configuration database schema

Config database schema scripts can be found under the `config_db_migrate`
directory.
Config database schema scripts can be found under the
`server/codechecker_server/migrations/config/versions` directory.

## Automatic migration script generation (Online)

@@ -30,20 +30,21 @@ version.
The configuration database schema file can be found here:
`server/codechecker_server/database/config_db_model.py`

### **Step 2**: Check the alembic.ini configuration settings
### **Step 2**: Check the `alembic.ini` configuration settings

Database connection should point to the correct database.
Edit the sqlalchemy.url option in [alembic.ini](
https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file)
according to your database configuration.
Edit the `sqlalchemy.url` option in
[alembic.ini](https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file)
according to your database configuration.

### **Step 3**: Use alembic to autogenerate migration scripts

`alembic --name config_db revision --autogenerate -m "Change description"`

### **Step 4**: Check the generated scripts

The new migration script
`config_db_migrate/versions/{hash}_change_description.py` is generated.
`migrations/config/versions/{hash}_change_description.py` is generated.
**You must always check the generated script because sometimes it isn't
correct.**

Expand All @@ -60,29 +61,34 @@ Don't forget to commit the migration script with your other changes.

## Automatic migration script generation (Online)

A Codechecker server should be started and a product should be configured with
A CodeChecker server should be started and a product should be configured with
a previous database schema version.

Product (run) database schema scripts can be found under the
`server/codechecker_server/migrations/report/versions` directory.

### **Step 1**: Update the database model

The run database schema file can be found here:
`server/codechecker_server/database/run_db_model.py`

### **Step 2**: Check alembic.ini configuration
### **Step 2**: Check `alembic.ini` configuration

Database connection should point to the correct database.
Edit the sqlalchemy.url option in [alembic.ini](
https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file)
according to your database configuration.
Edit the `sqlalchemy.url` option in
[alembic.ini](https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file)
according to your database configuration.

#### **Step 2**: Generating migration scripts using autogenerate

`alembic --name run_db revision --autogenerate -m "Change description"`

#### **Step 3**: Check the generated scripts
The new migration script db_migrate/versions/{hash}_change_description.py is
generated. **You must always check the generated script because sometimes it
isn't correct.**

The new migration script
`migrations/report/versions/{hash}_change_description.py` is generated.
**You must always check the generated script because sometimes it isn't
correct.**

#### **Step 4**: Run all test cases.

@@ -118,7 +124,7 @@ and the other is the run database (storing analysis reports).
If there is some schema mismatch and migration is needed you will get a
warning at server start.

## IMPORTANT before schema upgrade
## IMPORTANT: before schema upgrade

If there is some schema change it is recommended to create a full backup
of your configuration and run databases before running the migration.
@@ -187,17 +193,16 @@ command.
$ CodeChecker server --db-upgrade-schema Default
[15:01] - Checking configuration database ...
[15:01] - Database is OK.
[15:01] - Preparing schema upgrade for Default
[15:01] - Preparing schema upgrade for 'Default'
[WARNING] [15:01] - Please note after migration only newer CodeChecker versions can be used to start the server
[WARNING] [15:01] - It is advised to make a full backup of your run databases.
[15:01] - Checking: Default
[15:01] - [Default] Database schema mismatch: migration is available.
Do you want to upgrade 'Default' to new schema? Y(es)/n(o) y
[15:01] - [Default] Schema will be upgraded...
[15:01] - ========================
[15:01] - Upgrading: Default
[15:01] - Database schema mismatch: migration is available.
Do you want to upgrade to new schema? Y(es)/n(o) y
Upgrading schema ...
Done.
Database is OK.
[15:01] - ========================
[15:02] - [Default] Upgrading...
[15:03] - [Default] Done upgrading.
```

Schema upgrade can be done for multiple products in a row if the
4 changes: 2 additions & 2 deletions web/client/codechecker_client/blame_info.py
@@ -6,8 +6,8 @@
from git.exc import InvalidGitRepositoryError, GitCommandError
from typing import Dict, Iterable, Optional

from codechecker_common.compatibility.multiprocessing import Pool
from codechecker_common.logger import get_logger
from codechecker_common.multiprocesspool import MultiProcessPool

LOG = get_logger('system')

@@ -115,7 +115,7 @@ def assemble_blame_info(
Returns the number of collected blame information.
"""
with MultiProcessPool() as executor:
with Pool() as executor:
file_blame_info = __collect_blame_info_for_files(
file_paths, executor.map)

23 changes: 14 additions & 9 deletions web/client/codechecker_client/cmd/store.py
@@ -42,18 +42,24 @@
from codechecker_client import product
from codechecker_common import arg, logger, cmd_config
from codechecker_common.checker_labels import CheckerLabels
from codechecker_common.compatibility.multiprocessing import Pool
from codechecker_common.source_code_comment_handler import \
SourceCodeCommentHandler
from codechecker_common.util import load_json
from codechecker_common.multiprocesspool import MultiProcessPool

from codechecker_web.shared import webserver_context, host_check
from codechecker_web.shared.env import get_default_workspace

try:
from codechecker_client.blame_info import assemble_blame_info
except ImportError:
pass
def assemble_blame_info(_, __) -> int:
"""
Shim for cases where Git blame info is not gatherable due to
missing libraries.
"""
raise NotImplementedError()


LOG = logger.get_logger('system')

@@ -371,7 +377,7 @@ def filter_source_files_with_comments(
"""
jobs = file_report_positions.items()

with MultiProcessPool() as executor:
with Pool() as executor:
return get_source_file_with_comments(jobs, executor.map)


@@ -447,7 +453,7 @@ def assemble_zip(inputs,

LOG.debug("Processing report files ...")

with MultiProcessPool() as executor:
with Pool() as executor:
analyzer_result_file_reports = parse_analyzer_result_files(
analyzer_result_file_paths, checker_labels, executor.map)

@@ -562,14 +568,13 @@ def assemble_zip(inputs,
zipf, file_paths)

if stats.num_of_blame_information:
LOG.info("Collecting blame information done.")
LOG.info("Collecting blame information... Done.")
else:
LOG.info("No blame information found for source files.")
except NameError:
except NotImplementedError:
LOG.warning(
"Collecting blame information has been failed. Make sure "
"'git' is available on your system to hide this warning "
"message.")
"Failed to collect blame information. Make sure Git is "
"installed on your system.")

zipf.writestr('content_hashes.json', json.dumps(file_to_hash))

8 changes: 5 additions & 3 deletions web/server/codechecker_server/api/product_server.py
@@ -376,7 +376,8 @@ def addProduct(self, product):
msg)

conn_str = SQLServer \
.from_cmdline_args(conn_str_args, IDENTIFIER, None, False, None) \
.from_cmdline_args(conn_str_args, product.endpoint, IDENTIFIER,
None, False, None) \
.get_connection_string()

is_rws_change_disabled = product.isReviewStatusChangeDisabled
@@ -534,8 +535,9 @@ def editProduct(self, product_id, new_config):
msg)

conn_str = SQLServer \
.from_cmdline_args(conn_str_args, IDENTIFIER, None,
False, None).get_connection_string()
.from_cmdline_args(conn_str_args, product.endpoint,
IDENTIFIER, None, False, None) \
.get_connection_string()

# If endpoint or database arguments change, the product
# configuration has changed so severely, that it needs
8 changes: 4 additions & 4 deletions web/server/codechecker_server/api/report_server.py
@@ -3400,8 +3400,8 @@ def removeRunReports(self, run_ids, report_filter, cmp_data):
# access timestamp to file entries to delay their removal (and avoid
# removing frequently accessed files). The same comment applies to
# removeRun() function.
db_cleanup.remove_unused_comments(self._Session)
db_cleanup.remove_unused_analysis_info(self._Session)
db_cleanup.remove_unused_comments(self._product)
db_cleanup.remove_unused_analysis_info(self._product)

return True

@@ -3445,8 +3445,8 @@ def removeRun(self, run_id, run_filter):
# error. An alternative solution can be adding a timestamp to file
# entries to delay their removal. The same comment applies to
# removeRunReports() function.
db_cleanup.remove_unused_comments(self._Session)
db_cleanup.remove_unused_analysis_info(self._Session)
db_cleanup.remove_unused_comments(self._product)
db_cleanup.remove_unused_analysis_info(self._product)

return bool(runs)

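
A purely hypothetical sketch of what the new call shape enables: `db_cleanup` receives the product object instead of a bare session factory, so a cleanup routine can open its own session and prefix its log lines with the product's `endpoint` (the attribute names used below are assumptions, not taken from this diff):

```python
# Hypothetical sketch only; the real db_cleanup internals are not shown in
# this excerpt. The point is that the product object carries both a session
# factory and the endpoint used to tag interleaved, parallel log output.
from codechecker_common.logger import get_logger

LOG = get_logger('system')


def remove_unused_comments(product) -> None:
    # 'endpoint' and 'session_factory' are assumed attribute names.
    endpoint = product.endpoint
    with product.session_factory() as session:
        LOG.info("[%s] Garbage collecting dangling comments...", endpoint)
        # ... issue the DELETE statements against this product's database ...
        session.commit()
        LOG.info("[%s] Done.", endpoint)
```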
55 more changed files not shown.
