diff --git a/tools/extra/parse_log.py b/tools/extra/parse_log.py
index 16ba077aee6..09ea216ced3 100755
--- a/tools/extra/parse_log.py
+++ b/tools/extra/parse_log.py
@@ -3,7 +3,7 @@
 """
 Parse training log
 
-Competitor to parse_log.sh
+Evolved from parse_log.sh
 """
 
 import os
@@ -11,18 +11,7 @@
 import extract_seconds
 import argparse
 import csv
-
-
-def get_line_type(line):
-    """Return either 'test' or 'train' depending on line type
-    """
-
-    line_type = None
-    if line.find('Train') != -1:
-        line_type = 'train'
-    elif line.find('Test') != -1:
-        line_type = 'test'
-    return line_type
+from collections import OrderedDict
 
 
 def parse_log(path_to_log):
@@ -36,70 +25,112 @@ def parse_log(path_to_log):
     for the two dict_lists
     """
 
-    re_iteration = re.compile('Iteration (\d+)')
-    re_accuracy = re.compile('output #\d+: accuracy = ([\.\d]+)')
-    re_train_loss = re.compile('Iteration \d+, loss = ([\.\d]+)')
-    re_output_loss = re.compile('output #\d+: loss = ([\.\d]+)')
-    re_lr = re.compile('lr = ([\.\d]+)')
+    regex_iteration = re.compile('Iteration (\d+)')
+    regex_train_output = re.compile('Train net output #(\d+): (\S+) = ([\.\deE+-]+)')
+    regex_test_output = re.compile('Test net output #(\d+): (\S+) = ([\.\deE+-]+)')
+    regex_learning_rate = re.compile('lr = ([\.\d]+)')
 
     # Pick out lines of interest
    iteration = -1
-    test_accuracy = -1
     learning_rate = float('NaN')
     train_dict_list = []
     test_dict_list = []
-    train_dict_names = ('NumIters', 'Seconds', 'TrainingLoss', 'LearningRate')
-    test_dict_names = ('NumIters', 'Seconds', 'TestAccuracy', 'TestLoss')
+    train_row = None
+    test_row = None
 
     logfile_year = extract_seconds.get_log_created_year(path_to_log)
     with open(path_to_log) as f:
         start_time = extract_seconds.get_start_time(f, logfile_year)
 
         for line in f:
-            iteration_match = re_iteration.search(line)
+            iteration_match = regex_iteration.search(line)
             if iteration_match:
                 iteration = float(iteration_match.group(1))
             if iteration == -1:
-                # Only look for other stuff if we've found the first iteration
+                # Only start parsing for other stuff if we've found the first
+                # iteration
                 continue
 
             time = extract_seconds.extract_datetime_from_line(line,
                                                               logfile_year)
             seconds = (time - start_time).total_seconds()
 
-            lr_match = re_lr.search(line)
-            if lr_match:
-                learning_rate = float(lr_match.group(1))
-
-            accuracy_match = re_accuracy.search(line)
-            if accuracy_match and get_line_type(line) == 'test':
-                test_accuracy = float(accuracy_match.group(1))
-
-            train_loss_match = re_train_loss.search(line)
-            if train_loss_match:
-                train_loss = float(train_loss_match.group(1))
-                train_dict_list.append({'NumIters': iteration,
-                                        'Seconds': seconds,
-                                        'TrainingLoss': train_loss,
-                                        'LearningRate': learning_rate})
-
-            output_loss_match = re_output_loss.search(line)
-            if output_loss_match and get_line_type(line) == 'test':
-                test_loss = float(output_loss_match.group(1))
-                # NOTE: we assume that (1) accuracy always comes right before
-                # loss for test data so the test_accuracy variable is already
-                # correctly populated and (2) there's one and only one output
-                # named "accuracy" for the test net
-                test_dict_list.append({'NumIters': iteration,
-                                       'Seconds': seconds,
-                                       'TestAccuracy': test_accuracy,
-                                       'TestLoss': test_loss})
-
-    return train_dict_list, train_dict_names, test_dict_list, test_dict_names
-
-
-def save_csv_files(logfile_path, output_dir, train_dict_list, train_dict_names,
-                   test_dict_list, test_dict_names, verbose=False):
+            learning_rate_match = regex_learning_rate.search(line)
+            if learning_rate_match:
+                learning_rate = float(learning_rate_match.group(1))
+
+            train_dict_list, train_row = parse_line_for_net_output(
+                regex_train_output, train_row, train_dict_list,
+                line, iteration, seconds, learning_rate
+            )
+            test_dict_list, test_row = parse_line_for_net_output(
+                regex_test_output, test_row, test_dict_list,
+                line, iteration, seconds, learning_rate
+            )
+
+    fix_initial_nan_learning_rate(train_dict_list)
+    fix_initial_nan_learning_rate(test_dict_list)
+
+    return train_dict_list, test_dict_list
+
+
+def parse_line_for_net_output(regex_obj, row, row_dict_list,
+                              line, iteration, seconds, learning_rate):
+    """Parse a single line for training or test output
+
+    Returns a tuple with (row_dict_list, row)
+    row: may be either a new row or an augmented version of the current row
+    row_dict_list: may be either the current row_dict_list or an augmented
+    version of the current row_dict_list
+    """
+
+    output_match = regex_obj.search(line)
+    if output_match:
+        if not row or row['NumIters'] != iteration:
+            # Push the last row and start a new one
+            if row:
+                # If we're on a new iteration, push the last row
+                # This will probably only happen for the first row; otherwise
+                # the full row checking logic below will push and clear full
+                # rows
+                row_dict_list.append(row)
+
+            row = OrderedDict([
+                ('NumIters', iteration),
+                ('Seconds', seconds),
+                ('LearningRate', learning_rate)
+            ])
+
+        # output_num is not used; may be used in the future
+        # output_num = output_match.group(1)
+        output_name = output_match.group(2)
+        output_val = output_match.group(3)
+        row[output_name] = float(output_val)
+
+    if row and len(row_dict_list) >= 1 and len(row) == len(row_dict_list[0]):
+        # The row is full, based on the fact that it has the same number of
+        # columns as the first row; append it to the list
+        row_dict_list.append(row)
+        row = None
+
+    return row_dict_list, row
+
+
+def fix_initial_nan_learning_rate(dict_list):
+    """Correct initial value of learning rate
+
+    Learning rate is normally not printed until after the initial test and
+    training step, which means the initial testing and training rows have
+    LearningRate = NaN. Fix this by copying over the LearningRate from the
+    second row, if it exists.
+    """
+
+    if len(dict_list) > 1:
+        dict_list[0]['LearningRate'] = dict_list[1]['LearningRate']
+
+
+def save_csv_files(logfile_path, output_dir, train_dict_list, test_dict_list,
+                   delimiter=',', verbose=False):
     """Save CSV files to output_dir
 
     If the input log file is, e.g., caffe.INFO, the names will be
@@ -108,18 +139,22 @@ def save_csv_files(logfile_path, output_dir, train_dict_list, train_dict_names,
 
     log_basename = os.path.basename(logfile_path)
     train_filename = os.path.join(output_dir, log_basename + '.train')
-    write_csv(train_filename, train_dict_list, train_dict_names, verbose)
+    write_csv(train_filename, train_dict_list, delimiter, verbose)
 
     test_filename = os.path.join(output_dir, log_basename + '.test')
-    write_csv(test_filename, test_dict_list, test_dict_names, verbose)
+    write_csv(test_filename, test_dict_list, delimiter, verbose)
 
 
-def write_csv(output_filename, dict_list, header_names, verbose=False):
+def write_csv(output_filename, dict_list, delimiter, verbose=False):
     """Write a CSV file
     """
 
+    dialect = csv.excel
+    dialect.delimiter = delimiter
+
     with open(output_filename, 'w') as f:
-        dict_writer = csv.DictWriter(f, header_names)
+        dict_writer = csv.DictWriter(f, fieldnames=dict_list[0].keys(),
+                                     dialect=dialect)
         dict_writer.writeheader()
         dict_writer.writerows(dict_list)
     if verbose:
@@ -141,16 +176,20 @@ def parse_args():
                         action='store_true',
                         help='Print some extra info (e.g., output filenames)')
 
+    parser.add_argument('--delimiter',
+                        default=',',
+                        help=('Column delimiter in output files '
+                              '(default: \'%(default)s\')'))
+
    args = parser.parse_args()
     return args
 
 
 def main():
     args = parse_args()
-    train_dict_list, train_dict_names, test_dict_list, test_dict_names = \
-        parse_log(args.logfile_path)
+    train_dict_list, test_dict_list = parse_log(args.logfile_path)
     save_csv_files(args.logfile_path, args.output_dir, train_dict_list,
-                   train_dict_names, test_dict_list, test_dict_names)
+                   test_dict_list, delimiter=args.delimiter)
 
 
 if __name__ == '__main__':