diff --git a/memory_profiler.py b/memory_profiler.py index f6ac274..0ef4e82 100644 --- a/memory_profiler.py +++ b/memory_profiler.py @@ -16,7 +16,7 @@ import inspect import subprocess import logging - +from collections import defaultdict # TODO: provide alternative when multiprocessing is not available try: @@ -112,10 +112,10 @@ def _get_child_memory(process, meminfo_attr=None): # Loop over the child processes and yield their memory try: for child in getattr(process, children_attr)(recursive=True): - yield getattr(child, meminfo_attr)()[0] / _TWO_20 + yield child.pid, getattr(child, meminfo_attr)()[0] / _TWO_20 except psutil.NoSuchProcess: # https://github.com/fabianp/memory_profiler/issues/71 - yield 0.0 + yield (0,0.0) # need to yield a tuple def _get_memory(pid, backend, timestamps=False, include_children=False, filename=None): @@ -143,7 +143,7 @@ def ps_util_tool(): else 'get_memory_info' mem = getattr(process, meminfo_attr)()[0] / _TWO_20 if include_children: - mem += sum(_get_child_memory(process, meminfo_attr)) + mem += sum((mem for (pid,mem) in _get_child_memory(process, meminfo_attr))) if timestamps: return mem, time.time() else: @@ -355,14 +355,14 @@ def memory_usage(proc=-1, interval=.1, timeout=None, timestamps=False, # Write children to the stream file if multiprocess: - for idx, chldmem in enumerate(_get_child_memory(proc.pid)): - stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(idx, chldmem, time.time())) + for chldpid, chldmem in _get_child_memory(proc.pid): + stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(chldpid, chldmem, time.time())) else: # Create a nested list with the child memory if multiprocess: mem_usage = [mem_usage] - for chldmem in _get_child_memory(proc.pid): - mem_usage.append(chldmem) + for chldpid, chldmem in _get_child_memory(proc.pid): + mem_usage.append((chldpid,chldmem)) # Append the memory usage to the return value ret.append(mem_usage) @@ -399,14 +399,14 @@ def memory_usage(proc=-1, interval=.1, timeout=None, timestamps=False, # Write children to the stream file if multiprocess: - for idx, chldmem in enumerate(_get_child_memory(proc.pid)): - stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(idx, chldmem, time.time())) + for child_pid, chldmem in _get_child_memory(proc): + stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(child_pid, chldmem, time.time())) else: # Create a nested list with the child memory if multiprocess: mem_usage = [mem_usage] - for chldmem in _get_child_memory(proc.pid): - mem_usage.append(chldmem) + for chldpid, chldmem in _get_child_memory(proc): + mem_usage.append((chldpid,chldmem)) # Append the memory usage to the return value ret.append(mem_usage) @@ -1207,3 +1207,168 @@ def flush(self): prof.show_results(stream=out_file) else: show_results(prof, precision=options.precision, stream=out_file) + + +### I/O + +def read_mprofile_file(filename): + """Read an mprofile file and return its content. + + Returns + ------- + content: dict + Keys: + + - "mem_usage": (list) memory usage values, in MiB + - "timestamp": (list) time instant for each memory usage value, in + second + - "func_timestamp": (dict) for each function, timestamps and memory + usage upon entering and exiting. + - 'cmd_line': (str) command-line ran for this profile. + """ + func_ts = {} + mem_usage = [] + timestamp = [] + children = defaultdict(list) + cmd_line = None + f = open(filename, "r") + for l in f: + if l == '\n': + raise ValueError('Sampling time was too short') + field, value = l.split(' ', 1) + if field == "MEM": + # mem, timestamp + values = value.split(' ') + mem_usage.append(float(values[0])) + timestamp.append(float(values[1])) + + elif field == "FUNC": + values = value.split(' ') + f_name, mem_start, start, mem_end, end = values[:5] + ts = func_ts.get(f_name, []) + ts.append([float(start), float(end), + float(mem_start), float(mem_end)]) + func_ts[f_name] = ts + + elif field == "CHLD": + values = value.split(' ') + chldnum = values[0] + children[chldnum].append( + (float(values[1]), float(values[2])) + ) + + elif field == "CMDLINE": + cmd_line = value + else: + pass + f.close() + + return {"mem_usage": mem_usage, "timestamp": timestamp, + "func_timestamp": func_ts, 'filename': filename, + 'cmd_line': cmd_line, 'children': children} + + +def read_mprofile_file_multiprocess(filename): + """Read an mprofile file and return a mem_usage list + + Returns + ------- + content: list + + This is analogous to the list obtained when the `memory_usage` is used + """ + + mem_usage = [] + sample = [] + + f = open(filename,'r') + + for i,l in enumerate(f): + if l == '\n': + raise ValueError('Sampling time was too short') + field, value = l.split(' ', 1) + values = value.split(' ') + + if field=="MEM": + # append the existing sample and reset to zero + mem_usage.append(sample) + sample = [] + sample.append((float(values[0]), float(values[1]))) + elif field=="CHLD": + sample.append((int(values[0]), float(values[1]))) + + f.close() + return mem_usage[1:] + + +def convert_mem_usage_to_df(filename, is_pickle=False): + """Convert a `mem_usage` list to a `pandas.DataFrame` + + Parameters + ---------- + filename: path to the memory profile data; can be either a file + created by mprof or a pickle of the result of `memory_usage` + + is_pickle: if True, assume the data is the pickled list + returned by `memory_usage` + + + Returns + ------- + content: pandas.DataFrame + + Returns a `pandas.DataFrame` with child IDs as columns and the timestamp as an index + """ + + try: + import pandas as pd + import numpy as np + except ImportError: + raise ImportError('Pandas and numpy are required for conversion to DataFrame') + + if is_pickle: + from cPickle import load + with open(filename) as f: + mem_usage = load(f) + + else: + mem_usage = read_mprofile_file_multiprocess(filename) + mem_usage = filter(lambda m: len(m) > 1, mem_usage) + + times =[m[0][1] for m in mem_usage] + + # flatten list of lists, extract the pids and attach '0' (parent) at the end + pids = np.sort(np.unique([m[0] for n in mem_usage for m in n[1:] if not isinstance(m,float)] + [0,])) + + time_lookup = {time: i for i,time in enumerate(times)} + pid_lookup = {pid:i for i,pid in enumerate(pids)} + + data = np.zeros((len(times), len(pids))) + + for i,m in enumerate(mem_usage): + t = m[0][1] + + # add the parent memory by hand + data[time_lookup[t]][pid_lookup[0]] = m[0][0] + + for pid,mem in m[1:]: + data[time_lookup[t]][pid_lookup[pid]] = mem + + return pd.DataFrame(data, index=times, columns=pids) + +def plot_mem_usage(filename, include_parent=True, plot_total=True, is_pickle=False): + import matplotlib.pylab as plt + + data_df = convert_mem_usage_to_df(filename, is_pickle) + + f = plt.figure(figsize=(10,6)) + + if not include_parent: + data_df = data_df[data_df.columns[1:]] + + data_df.plot(legend=False, figsize=(14,10), grid=True, fontsize=14) + + if plot_total: + data_df.sum(axis=1).plot(style='--', grid=True) + + plt.xlabel('timestamp'); plt.ylabel('memory usage in MB') \ No newline at end of file diff --git a/mprof b/mprof index 31811cb..26653be 100755 --- a/mprof +++ b/mprof @@ -199,11 +199,17 @@ def run_action(): parser.add_option("--multiprocess", "-M", dest="multiprocess", default=False, action="store_true", help="""Monitors forked processes creating individual plots for each child""") + parser.add_option("--pid", "-p", dest="pid", + default=False, action="store_true", + help="""Specify that the argument is a running pid not an executable or script""") + parser.add_option("--timeout", dest="timeout", + default=None, action="store", type=int, + help="""Timeout in seconds""") (options, args) = parser.parse_args() - if len(args) == 0: - print("A program to run must be provided. Use -h for help") + if (len(args) == 0): + print("A program to run or a pid must be provided. Use -h for help") sys.exit(1) print("{1}: Sampling memory every {0.interval}s".format( @@ -218,30 +224,36 @@ def run_action(): mprofile_output = "mprofile_%s.dat" % suffix # .. TODO: more than one script as argument ? .. - if args[0].endswith('.py') and not options.nopython: - if not args[0].startswith("python"): - args.insert(0, "python") - if options.multiprocess: - # in multiprocessing mode you want to spawn a separate - # python process - options.python = False - if options.python: - print("running as a Python program...") - if not args[0].startswith("python"): - args.insert(0, "python") - cmd_line = get_cmd_line(args) - args[1:1] = ("-m", "memory_profiler", "--timestamp", - "-o", mprofile_output) - p = subprocess.Popen(args) + if not options.pid: + if args[0].endswith('.py') and not options.nopython: + if not args[0].startswith("python"): + args.insert(0, "python") + if options.multiprocess: + # in multiprocessing mode you want to spawn a separate + # python process + options.python = False + if options.python: + print("running as a Python program...") + if not args[0].startswith("python"): + args.insert(0, "python") + cmd_line = get_cmd_line(args) + args[1:1] = ("-m", "memory_profiler", "--timestamp", + "-o", mprofile_output) + p = subprocess.Popen(args) + else: + cmd_line = get_cmd_line(args) + p = subprocess.Popen(args) else: - cmd_line = get_cmd_line(args) - p = subprocess.Popen(args) + p = int(args[0]) with open(mprofile_output, "a") as f: - f.write("CMDLINE {0}\n".format(cmd_line)) + if not options.pid: + f.write("CMDLINE {0}\n".format(cmd_line)) + mp.memory_usage(proc=p, interval=options.interval, timestamps=True, include_children=options.include_children, - multiprocess=options.multiprocess, stream=f) + multiprocess=options.multiprocess, stream=f, + timeout=options.timeout) def add_brackets(xloc, yloc, xshift=0, color="r", label=None, options=None): @@ -291,61 +303,6 @@ def add_brackets(xloc, yloc, xshift=0, color="r", label=None, options=None): ## pl.plot(xloc[1], yloc[1], ">"+color, markersize=7) -def read_mprofile_file(filename): - """Read an mprofile file and return its content. - - Returns - ======= - content: dict - Keys: - - - "mem_usage": (list) memory usage values, in MiB - - "timestamp": (list) time instant for each memory usage value, in - second - - "func_timestamp": (dict) for each function, timestamps and memory - usage upon entering and exiting. - - 'cmd_line': (str) command-line ran for this profile. - """ - func_ts = {} - mem_usage = [] - timestamp = [] - children = defaultdict(list) - cmd_line = None - f = open(filename, "r") - for l in f: - if l == '\n': - raise ValueError('Sampling time was too short') - field, value = l.split(' ', 1) - if field == "MEM": - # mem, timestamp - values = value.split(' ') - mem_usage.append(float(values[0])) - timestamp.append(float(values[1])) - - elif field == "FUNC": - values = value.split(' ') - f_name, mem_start, start, mem_end, end = values[:5] - ts = func_ts.get(f_name, []) - ts.append([float(start), float(end), - float(mem_start), float(mem_end)]) - func_ts[f_name] = ts - - elif field == "CHLD": - values = value.split(' ') - chldnum = values[0] - children[chldnum].append( - (float(values[1]), float(values[2])) - ) - - elif field == "CMDLINE": - cmd_line = value - else: - pass - f.close() - - return {"mem_usage": mem_usage, "timestamp": timestamp, - "func_timestamp": func_ts, 'filename': filename, - 'cmd_line': cmd_line, 'children': children} def plot_file(filename, index=0, timestamps=True, children=True, options=None): @@ -355,7 +312,7 @@ def plot_file(filename, index=0, timestamps=True, children=True, options=None): print("matplotlib is needed for plotting.") sys.exit(1) import numpy as np # pylab requires numpy anyway - mprofile = read_mprofile_file(filename) + mprofile = mp.read_mprofile_file(filename) if len(mprofile['timestamp']) == 0: print('** No memory usage values have been found in the profile ' @@ -413,7 +370,7 @@ def plot_file(filename, index=0, timestamps=True, children=True, options=None): cmem = np.asarray([item[0] for item in data]) # Plot the line to the figure - pl.plot(cts, cmem, "+-" + mem_line_colors[idx+1 % len(mem_line_colors)], + pl.plot(cts, cmem, "+-" + mem_line_colors[(idx+1) % len(mem_line_colors)], label="child {}".format(proc)) # Detect the maximal child memory point