
[analyzer] Collect CTU-involved files in the report directory #3029

Merged 2 commits on Jan 19, 2021
40 changes: 36 additions & 4 deletions analyzer/codechecker_analyzer/analysis_manager.py
@@ -444,6 +444,30 @@ def __cleanup_timeout():
return __cleanup_timeout


def collect_ctu_involved_files(result_handler, source_analyzer, output_dir):
"""
This function collects the list of source files involved in the CTU
analysis. The list of files is written to output_dir.
"""
if source_analyzer.ANALYZER_NAME != ClangSA.ANALYZER_NAME:
return

involved_files = set()

involved_files.update(source_analyzer.get_analyzer_mentioned_files(
result_handler.analyzer_stdout))
involved_files.update(source_analyzer.get_analyzer_mentioned_files(
result_handler.analyzer_stderr))

out = os.path.join(output_dir, result_handler.analyzer_action_str)

if involved_files:
Reviewer (Contributor):

It is possible that previously this set wasn't empty, so we created an involved-files file, but the next time it was empty, so we do nothing. Don't we need to remove the previous file? Something similar to this:

out = os.path.join(output_dir, result_handler.analyzer_action_str)
if involved_files:
    ...
elif os.path.exists(out):
    os.remove(out)

Author (bruntib):

I don't understand this comment. This directory contains generated files for those TUs which involve some other source files during CTU analysis. It doesn't matter if there were files here with the same name, because those will be overwritten. The content of this directory behaves the same way as the failed directory.

Reviewer (Contributor):

@bruntib Okay, I'll try to explain my problem again. Let's assume that you analyzed the same TU two times. The first time, the involved_files variable contains some file (lib.cpp), so you will create a file (result_handler.analyzer_action_str) which will contain the involved files (lib.cpp). If you change something in your code and analyze your TU again, and the involved_files set is empty, you will do nothing. But in the output_dir there will still be a file for this TU (result_handler.analyzer_action_str) which contains the lib.cpp involved file from the previous analysis.
My question was: in this case, don't we need to remove this file if it exists and the involved_files set is empty?

Author (bruntib):

Well, it doesn't cause any problem if those files are not removed. This is analogous to failed ZIPs and .plist files: those are also not removed when a later run analyzes fewer TUs. We could remove these and the failed ZIPs if we want to gain some free space, though it's not that significant, I think. But I'll check it and do this removal for failed ZIPs accordingly in a next commit.

Author (bruntib):

Ok, I did it.

with open(out, 'w', encoding='utf-8', errors='ignore') as f:
f.write('\n'.join(involved_files))
elif os.path.exists(out):
os.remove(out)
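The behavior settled in the thread above — write the file when the set is non-empty, otherwise delete any file left over from a previous run — can be exercised in isolation. This is a sketch: `write_involved_files` and the file name are illustrative stand-ins, not CodeChecker API, and the sorting is only there for deterministic output.

```python
import os
import tempfile

def write_involved_files(out, involved_files):
    # Persist the involved-files list when it is non-empty; otherwise
    # drop any stale file left behind by a previous analysis run.
    if involved_files:
        with open(out, 'w', encoding='utf-8', errors='ignore') as f:
            f.write('\n'.join(sorted(involved_files)))
    elif os.path.exists(out):
        os.remove(out)

with tempfile.TemporaryDirectory() as report_dir:
    out = os.path.join(report_dir, 'main.c_1234abcd')  # illustrative name
    write_involved_files(out, {'lib.c'})  # first analysis: file is created
    assert os.path.exists(out)
    write_involved_files(out, set())      # re-analysis: stale file removed
    assert not os.path.exists(out)
```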


def check(check_data):
"""
Invoke clang with an action which is called by processes.
@@ -627,6 +651,9 @@ def __create_timeout(analyzer_process):
handle_failure(source_analyzer, rh, zip_file,
result_base, actions_map)

collect_ctu_involved_files(rh, source_analyzer,
Reviewer (Contributor):

Do we want this collection even if the analysis was successful?

Author (bruntib):

Yes, we do. These files have to be used for replicating false-positive reports.

Reviewer (Contributor):

Ok, please add a comment here why the collection is done even if the analysis was successful.

output_dirs['ctu_connections'])

if skip_handler and os.path.exists(result_file):
# We need to check the plist content because skipping
# reports in headers can be done only this way.
@@ -702,19 +729,24 @@ def signal_handler(signum, frame):
initargs=(checked_var,
actions_num))

# If the analysis has failed, we help debugging.
failed_dir = os.path.join(output_path, "failed")
if not os.path.exists(failed_dir):
os.makedirs(failed_dir)

# Analysis was successful, processing results.
success_dir = os.path.join(output_path, "success")
if not os.path.exists(success_dir):
os.makedirs(success_dir)

# Collect what other TUs were involved during CTU analysis.
ctu_connections_dir = os.path.join(output_path, "ctu_connections")
if not os.path.exists(ctu_connections_dir):
os.makedirs(ctu_connections_dir)

output_dirs = {'success': success_dir,
'failed': failed_dir,
'ctu_connections': ctu_connections_dir}

# Construct analyzer env.
analyzer_environment = env.extend(context.path_env_extra,
12 changes: 12 additions & 0 deletions analyzer/tests/functional/ctu/test_ctu.py
@@ -209,6 +209,18 @@ def __check_ctu_analyze(self, output):
self.assertIn("lib.c:3:", output)
self.assertIn("[core.NullDereference]", output)

# We assume that only main.c has been analyzed with CTU and it involves
# lib.c during its analysis.
connections_dir = os.path.join(self.report_dir, 'ctu_connections')
connections_files = os.listdir(connections_dir)
self.assertEqual(len(connections_files), 1)

connections_file = connections_files[0]
self.assertTrue(connections_file.startswith('main.c'))

with open(os.path.join(connections_dir, connections_file)) as f:
self.assertTrue(f.readline().endswith('lib.c'))

@skipUnlessCTUCapable
def test_ctu_makefile_generation(self):
""" Test makefile generation in CTU mode. """
10 changes: 10 additions & 0 deletions tools/tu_collector/tests/project/compile_command.json
@@ -13,5 +13,15 @@
"directory": "/tmp",
"command": "ccache gcc -o /dev/null -std=c11 hello.c",
"file": "hello.c"
},
{
"directory": "/tmp",
"command": "g++ -o /dev/null ctu.cpp",
"file": "ctu.cpp"
},
{
"directory": "/tmp",
"command": "g++ -o /dev/null zero.cpp",
"file": "zero.cpp"
}
]
6 changes: 6 additions & 0 deletions tools/tu_collector/tests/project/ctu.cpp
@@ -0,0 +1,6 @@
#include "zero.h"

int main()
{
int i = 1 / zero();
}
2 changes: 2 additions & 0 deletions tools/tu_collector/tests/project/zero.cpp
@@ -0,0 +1,2 @@
#include <vector>
int zero() { return 0; }
1 change: 1 addition & 0 deletions tools/tu_collector/tests/project/zero.h
@@ -0,0 +1 @@
int zero();
41 changes: 35 additions & 6 deletions tools/tu_collector/tests/tu_collector_test.py
@@ -13,8 +13,10 @@
"""


import inspect
import json
import os
import shutil
import tempfile
import unittest
import zipfile
@@ -28,23 +30,21 @@ def setUp(self):
dir_path = os.path.dirname(os.path.realpath(__file__))
self._test_proj_dir = os.path.join(dir_path, 'project')

compile_json = os.path.join(self._test_proj_dir,
'compile_command.json')

with open(compile_json, mode='rb') as cmpjson:
self.compile_cmd_data = json.load(cmpjson)

# Overwrite the directory paths.
# This is needed because the tests run on different machines
# so the directory path changes in each case.
for cmp in self.compile_cmd_data:
cmp['directory'] = self._test_proj_dir

def test_file_existence(self):
zip_file_name = tempfile.mkstemp(suffix='.zip')[1]
tu_collector.zip_tu_files(zip_file_name, self.compile_cmd_data)

with zipfile.ZipFile(zip_file_name) as archive:
files = archive.namelist()
@@ -60,3 +60,32 @@ def test_file_existence(self):
self.assertTrue(
any([path.endswith(os.path.join('/', 'hello.c')) for path in files]))
self.assertIn('compilation_database.json', files)

def test_ctu_collection(self):
with tempfile.TemporaryDirectory() as ctu_deps_dir:
ctu_action = next(filter(lambda ba: ba['file'] == 'ctu.cpp',
self.compile_cmd_data))

hash_fun = dict(
inspect.getmembers(tu_collector))['__analyzer_action_hash']

with open(os.path.join(ctu_deps_dir, hash_fun(ctu_action)), 'w') \
as f:
f.write(os.path.join(self._test_proj_dir, 'zero.cpp'))

with tempfile.NamedTemporaryFile(suffix='.zip') as zip_file:
tu_collector.zip_tu_files(zip_file.name, self.compile_cmd_data,
file_filter='ctu.cpp',
ctu_deps_dir=ctu_deps_dir)

with zipfile.ZipFile(zip_file.name) as archive:
files = archive.namelist()

self.assertTrue(
any([path.endswith(os.path.join('/', 'vector')) for path in files]))
self.assertTrue(
any([path.endswith(os.path.join('/', 'ctu.cpp')) for path in files]))
self.assertTrue(
any([path.endswith(os.path.join('/', 'zero.cpp')) for path in files]))
self.assertTrue(
any([path.endswith(os.path.join('/', 'zero.h')) for path in files]))
122 changes: 107 additions & 15 deletions tools/tu_collector/tu_collector/tu_collector.py
@@ -17,6 +17,7 @@
import argparse
import collections
import fnmatch
import hashlib
import json
import logging
import os
@@ -209,11 +210,63 @@ def __eliminate_argument(arg_vect, opt_string, has_arg=False):
raise IOError(output)


def __filter_compilation_db(compilation_db, filt):
return [
action for action in compilation_db if fnmatch.fnmatch(
action['file'],
filt)]
def __analyzer_action_hash(build_action):
"""
This function returns a hash of a build action. The algorithm duplicates
the one in CodeChecker, so it is important to keep the two in sync!
Basically the hash is computed from the build command, excluding the -o
flag. The exclusion is due to a special behavior of the "make" utility.
See the full description in CodeChecker: analyzer_action_str() contains
the documentation of the issue.
"""
source_file = os.path.normpath(
os.path.join(build_action['directory'], build_action['file']))

args = shlex.split(build_action['command'])
indices = [idx for idx, v in enumerate(args) if v.startswith('-o')]

for idx in reversed(indices):
# Output can be given separate or joint:
# -o a.out vs -oa.out
# In the first case we delete its argument too.
if args[idx] == '-o':
del args[idx]
del args[idx]

build_info = source_file + '_' + ' '.join(args)

return hashlib.md5(build_info.encode(errors='ignore')).hexdigest()
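Why the -o exclusion matters can be shown with a small self-contained re-implementation of the scheme described above. This is a sketch under stated assumptions: `action_hash` is an illustrative stand-in, not the real tu_collector function, though it mirrors the same md5-over-stripped-command idea.

```python
import hashlib
import os
import shlex

def action_hash(build_action):
    # Hash the normalized source path plus the build command with every
    # -o output flag (both '-o a.out' and '-oa.out' forms) removed.
    source_file = os.path.normpath(
        os.path.join(build_action['directory'], build_action['file']))
    args = shlex.split(build_action['command'])
    # Walk the -o occurrences from the back so earlier indices stay valid.
    for idx in reversed([i for i, a in enumerate(args)
                         if a.startswith('-o')]):
        if args[idx] == '-o':
            del args[idx]  # separate form '-o a.out': drop the flag...
        del args[idx]      # ...and its argument (or the joint '-oa.out')
    build_info = source_file + '_' + ' '.join(args)
    return hashlib.md5(build_info.encode(errors='ignore')).hexdigest()

a = {'directory': '/tmp', 'file': 'ctu.cpp',
     'command': 'g++ -o /dev/null ctu.cpp'}
b = {'directory': '/tmp', 'file': 'ctu.cpp',
     'command': 'g++ -oa.out ctu.cpp'}
# The output file name differs, but the hash ignores it, so a TU keeps
# the same hash regardless of where "make" directs its output:
assert action_hash(a) == action_hash(b)
```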


def __get_ctu_buildactions(build_action, compilation_db, ctu_deps_dir):
"""
CodeChecker collects which source files were involved in the CTU analysis.
This function returns the build actions which describe the compilation of
these files.
build_action -- The build action object which is analyzed in CTU mode. Its
CTU dependencies will be returned.
compilation_db -- A complete compilation database. This function returns
some of its elements: the ones which describe the
compilation of the files listed in the file under
ctu_deps_dir belonging to build_action.
ctu_deps_dir -- A directory created by CodeChecker under the report
directory. The names of the files in this folder contain
a hash of a build action. See __analyzer_action_hash().
"""
ctu_deps_file = next(
(f for f in os.listdir(ctu_deps_dir)
if __analyzer_action_hash(build_action) in f), None)

if not ctu_deps_file:
return

with open(os.path.join(ctu_deps_dir, ctu_deps_file),
encoding='utf-8', errors='ignore') as f:
files = list(map(lambda x: x.strip(), f.readlines()))

return filter(
lambda ba: os.path.join(ba['directory'], ba['file']) in files,
compilation_db)
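The matching step at the end of the function can be sketched standalone: build actions are kept when their absolute source path appears among the paths read from the ctu_connections file. Names here (`ctu_dep_actions`, the sample database) are illustrative, not the tu_collector API.

```python
import os

def ctu_dep_actions(dep_paths, compilation_db):
    # Keep the build actions whose absolute source path is listed in the
    # ctu_connections file contents (dep_paths).
    return [ba for ba in compilation_db
            if os.path.join(ba['directory'], ba['file']) in dep_paths]

db = [
    {'directory': '/tmp', 'file': 'ctu.cpp', 'command': 'g++ ctu.cpp'},
    {'directory': '/tmp', 'file': 'zero.cpp', 'command': 'g++ zero.cpp'},
]
deps = {'/tmp/zero.cpp'}  # as read from a ctu_connections file
assert [ba['file'] for ba in ctu_dep_actions(deps, db)] == ['zero.cpp']
```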


def get_dependent_headers(command, build_dir, collect_toolchain=True):
Expand Down Expand Up @@ -288,7 +341,8 @@ def add_sources_to_zip(zip_file, files):
"again!", f)


def zip_tu_files(zip_file, compilation_database, file_filter='*',
write_mode='w', ctu_deps_dir=None):
"""
Collects all files to a zip file which are required for the compilation of
the translation units described by the given compilation database.
@@ -299,9 +353,22 @@
zip_file -- A file name or a file object.
compilation_database -- Either a path of the compilation database JSON file
or a list of the parsed JSON.
file_filter -- A glob used as file name filter. The files of a TU will be
collected only if the "file" attribute of the build action
matches this glob.
write_mode -- The file opening mode of the zip_file. In case of 'a' the new
files are appended to the existing zip file, in case of 'w'
the files are added to a clean zip file.
ctu_deps_dir -- This directory contains a list of files. Each file belongs
to a translation unit and lists what other files were
involved during its CTU analysis. These files and their
included headers will also be compressed into the .zip
file. File names in this directory must contain a hash of
the build command; see the __analyzer_action_hash()
documentation. When using this option, make sure the
compilation database given in the first argument contains
the build commands of these files, otherwise the files in
this folder can't be identified.
"""
if isinstance(compilation_database, str):
with open(compilation_database, encoding="utf-8", errors="ignore") as f:
@@ -311,7 +378,22 @@
tu_files = set()
error_messages = ''

for buildaction in compilation_database:
filtered_compilation_database = list(filter(
lambda action: fnmatch.fnmatch(action['file'], file_filter),
compilation_database))

if ctu_deps_dir:
involved_ctu_actions = []

for action in filtered_compilation_database:
involved_ctu_actions.extend(__get_ctu_buildactions(
action, compilation_database, ctu_deps_dir))

for action in involved_ctu_actions:
if action not in filtered_compilation_database:
filtered_compilation_database.append(action)

for buildaction in filtered_compilation_database:
files, err = get_dependent_headers(
buildaction['command'],
buildaction['directory'])
@@ -402,13 +484,23 @@ def main():
"command database file specified at this path.")

parser.add_argument('-f', '--filter', dest='filter',
type=str, required=False,
type=str, required=False, default='*',
help="If '--zip' option is given this flag restricts "
"the collection on the build actions of which "
"the compiled source file matches this path. "
"If '--dependents' option is given this flag "
"specify a header file to get source file "
"dependencies for. E.g.: /path/to/*/files")
parser.add_argument('--ctu-deps-dir', dest='ctu_deps_dir',
type=str, required=False,
help="When using 'CodeChecker analyze --ctu' command, "
"the results go to an output directory. This "
"folder contains a directory named "
"'ctu_connections' which stores the information "
"what TUs are involved during the analysis of "
"each source file. Once this directory is "
"provided, this tool also collects the sourced "
"of TUs involved by CTU analysis.")

output_args = parser.add_argument_group(
"output arguments",
@@ -454,15 +546,15 @@
'directory': os.getcwd()}]

if args.zip:
if args.command and args.filter:
LOG.warning('In case of using build command --filter has no '
'effect.')
if args.command and args.ctu_deps_dir:
LOG.warning('In case of using build command --ctu-deps-dir has no '
'effect.')

zip_tu_files(args.zip, compilation_db, args.filter,
ctu_deps_dir=args.ctu_deps_dir)
LOG.info("Done.")
else:
deps = get_dependent_sources(compilation_db, args.filter)