* memory benchmark rss * have both forward pass and line-by-line mem tracing * cleaned up tracing * refactored and cleaning up API * no f-strings yet... * add GPU mem logging * fix GPU memory monitoring * style and quality * clean up and doc * update with comments * Switching to python 3.6+ * fix quality
342 lines
15 KiB
Python
342 lines
15 KiB
Python
"""
|
|
Utilities for working with the local dataset cache.
|
|
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
|
|
Copyright by the AllenNLP authors.
|
|
"""
|
|
|
|
import linecache
|
|
import logging
|
|
import os
|
|
import sys
|
|
from collections import defaultdict
|
|
from typing import Iterable, List, NamedTuple, Optional, Union
|
|
|
|
from .file_utils import is_tf_available, is_torch_available
|
|
|
|
|
|
if is_torch_available():
|
|
from torch.cuda import empty_cache as torch_empty_cache
|
|
if is_tf_available():
|
|
from tensorflow.python.eager import context as tf_context
|
|
|
|
|
|
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
|
|
|
|
|
_is_memory_tracing_enabled = False
|
|
|
|
|
|
def is_memory_tracing_enabled():
|
|
global _is_memory_tracing_enabled
|
|
return _is_memory_tracing_enabled
|
|
|
|
|
|
class Frame(NamedTuple):
|
|
""" `Frame` is a NamedTuple used to gather the current frame state.
|
|
`Frame` has the following fields:
|
|
- 'filename' (string): Name of the file currently executed
|
|
- 'module' (string): Name of the module currently executed
|
|
- 'line_number' (int): Number of the line currently executed
|
|
- 'event' (string): Event that triggered the tracing (default will be "line")
|
|
- 'line_text' (string): Text of the line in the python script
|
|
"""
|
|
|
|
filename: str
|
|
module: str
|
|
line_number: int
|
|
event: str
|
|
line_text: str
|
|
|
|
|
|
class UsedMemoryState(NamedTuple):
|
|
""" `UsedMemoryState` are named tuples with the following fields:
|
|
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
|
|
- 'cpu_memory': CPU RSS memory state *before* executing the line
|
|
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
|
|
"""
|
|
|
|
frame: Frame
|
|
cpu_memory: int
|
|
gpu_memory: int
|
|
|
|
|
|
class Memory(NamedTuple):
|
|
""" `Memory` NamedTuple have a single field `bytes` and
|
|
you can get a human readable string of the number of bytes by calling `__repr__`
|
|
- `byte` (integer): number of bytes,
|
|
"""
|
|
|
|
bytes: int
|
|
|
|
def __repr__(self) -> str:
|
|
return bytes_to_human_readable(self.bytes)
|
|
|
|
|
|
class MemoryState(NamedTuple):
|
|
""" `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
|
|
- `frame` (`Frame`): the current frame (see above)
|
|
- `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
|
|
- `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
|
|
- `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
|
|
"""
|
|
|
|
frame: Frame
|
|
cpu: Memory
|
|
gpu: Memory
|
|
cpu_gpu: Memory
|
|
|
|
|
|
class MemorySummary(NamedTuple):
|
|
""" `MemorySummary` namedtuple otherwise with the fields:
|
|
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
|
|
by substracting the memory after executing each line from the memory before executing said line.
|
|
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
|
|
obtained by summing repeted memory increase for a line if it's executed several times.
|
|
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
|
|
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
|
|
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
|
|
"""
|
|
|
|
sequential: List[MemoryState]
|
|
cumulative: List[MemoryState]
|
|
total: Memory
|
|
|
|
|
|
MemoryTrace = List[UsedMemoryState]
|
|
|
|
|
|
def start_memory_tracing(
|
|
modules_to_trace: Optional[Union[str, Iterable[str]]] = None,
|
|
modules_not_to_trace: Optional[Union[str, Iterable[str]]] = None,
|
|
events_to_trace: str = "line",
|
|
gpus_to_trace: Optional[List[int]] = None,
|
|
) -> MemoryTrace:
|
|
""" Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
|
|
See `../../examples/benchmarks.py for a usage example.
|
|
Current memory consumption is returned using psutil and in particular is the RSS memory
|
|
"Resident Set Size” (the non-swapped physical memory the process is using).
|
|
See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
|
|
|
|
Args:
|
|
- `modules_to_trace`: (None, string, list/tuple of string)
|
|
if None, all events are recorded
|
|
if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
|
|
- `modules_not_to_trace`: (None, string, list/tuple of string)
|
|
if None, no module is avoided
|
|
if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
|
|
- `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
|
|
default to line
|
|
- `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs
|
|
|
|
Return:
|
|
- `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
|
|
- `UsedMemoryState` are named tuples with the following fields:
|
|
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
|
|
- 'cpu_memory': CPU RSS memory state *before* executing the line
|
|
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
|
|
|
|
`Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
|
|
`Frame` has the following fields:
|
|
- 'filename' (string): Name of the file currently executed
|
|
- 'module' (string): Name of the module currently executed
|
|
- 'line_number' (int): Number of the line currently executed
|
|
- 'event' (string): Event that triggered the tracing (default will be "line")
|
|
- 'line_text' (string): Text of the line in the python script
|
|
|
|
"""
|
|
try:
|
|
import psutil
|
|
except (ImportError):
|
|
logger.warning(
|
|
"Psutil not installed, we won't log CPU memory usage. "
|
|
"Install psutil (pip install psutil) to use CPU memory tracing."
|
|
)
|
|
process = None
|
|
else:
|
|
process = psutil.Process(os.getpid())
|
|
|
|
try:
|
|
from py3nvml import py3nvml
|
|
|
|
py3nvml.nvmlInit()
|
|
devices = list(range(py3nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
|
|
py3nvml.nvmlShutdown()
|
|
except ImportError:
|
|
logger.warning(
|
|
"py3nvml not installed, we won't log GPU memory usage. "
|
|
"Install py3nvml (pip install py3nvml) to use GPU memory tracing."
|
|
)
|
|
log_gpu = False
|
|
except (OSError, py3nvml.NVMLError):
|
|
logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.")
|
|
log_gpu = False
|
|
else:
|
|
log_gpu = is_torch_available() or is_tf_available()
|
|
|
|
memory_trace = []
|
|
|
|
def traceit(frame, event, args):
|
|
""" Tracing method executed before running each line in a module or sub-module
|
|
Record memory allocated in a list with debugging information
|
|
"""
|
|
global _is_memory_tracing_enabled
|
|
|
|
if not _is_memory_tracing_enabled:
|
|
return traceit
|
|
|
|
# Filter events
|
|
if events_to_trace is not None:
|
|
if isinstance(events_to_trace, str) and event != events_to_trace:
|
|
return traceit
|
|
elif isinstance(events_to_trace, (list, tuple)) and event not in events_to_trace:
|
|
return traceit
|
|
|
|
# Filter modules
|
|
name = frame.f_globals["__name__"]
|
|
if not isinstance(name, str):
|
|
return traceit
|
|
else:
|
|
# Filter whitelist of modules to trace
|
|
if modules_to_trace is not None:
|
|
if isinstance(modules_to_trace, str) and modules_to_trace not in name:
|
|
return traceit
|
|
elif isinstance(modules_to_trace, (list, tuple)) and all(m not in name for m in modules_to_trace):
|
|
return traceit
|
|
|
|
# Filter blacklist of modules not to trace
|
|
if modules_not_to_trace is not None:
|
|
if isinstance(modules_not_to_trace, str) and modules_not_to_trace in name:
|
|
return traceit
|
|
elif isinstance(modules_not_to_trace, (list, tuple)) and any(m in name for m in modules_not_to_trace):
|
|
return traceit
|
|
|
|
# Record current tracing state (file, location in file...)
|
|
lineno = frame.f_lineno
|
|
filename = frame.f_globals["__file__"]
|
|
if filename.endswith(".pyc") or filename.endswith(".pyo"):
|
|
filename = filename[:-1]
|
|
line = linecache.getline(filename, lineno).rstrip()
|
|
traced_state = Frame(filename, name, lineno, event, line)
|
|
|
|
# Record current memory state (rss memory) and compute difference with previous memory state
|
|
cpu_mem = 0
|
|
if process is not None:
|
|
mem = process.memory_info()
|
|
cpu_mem = mem.rss
|
|
|
|
gpu_mem = 0
|
|
if log_gpu:
|
|
# Clear GPU caches
|
|
if is_torch_available():
|
|
torch_empty_cache()
|
|
if is_tf_available():
|
|
tf_context.context()._clear_caches() # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802
|
|
|
|
# Sum used memory for all GPUs
|
|
py3nvml.nvmlInit()
|
|
for i in devices:
|
|
handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
|
|
meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
|
|
gpu_mem += meminfo.used
|
|
py3nvml.nvmlShutdown()
|
|
|
|
mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem)
|
|
memory_trace.append(mem_state)
|
|
|
|
return traceit
|
|
|
|
sys.settrace(traceit)
|
|
|
|
global _is_memory_tracing_enabled
|
|
_is_memory_tracing_enabled = True
|
|
|
|
return memory_trace
|
|
|
|
|
|
def stop_memory_tracing(
|
|
memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True
|
|
) -> Optional[MemorySummary]:
|
|
""" Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
|
|
|
|
Args:
|
|
- `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
|
|
- `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total memory
|
|
|
|
Return:
|
|
- None if `memory_trace` is None
|
|
- `MemorySummary` namedtuple otherwise with the fields:
|
|
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
|
|
by substracting the memory after executing each line from the memory before executing said line.
|
|
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
|
|
obtained by summing repeted memory increase for a line if it's executed several times.
|
|
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
|
|
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
|
|
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
|
|
|
|
`Memory` named tuple have fields
|
|
- `byte` (integer): number of bytes,
|
|
- `string` (string): same as human readable string (ex: "3.5MB")
|
|
|
|
`Frame` are namedtuple used to list the current frame state and have the following fields:
|
|
- 'filename' (string): Name of the file currently executed
|
|
- 'module' (string): Name of the module currently executed
|
|
- 'line_number' (int): Number of the line currently executed
|
|
- 'event' (string): Event that triggered the tracing (default will be "line")
|
|
- 'line_text' (string): Text of the line in the python script
|
|
|
|
`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
|
|
- `frame` (`Frame`): the current frame (see above)
|
|
- `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
|
|
- `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
|
|
- `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
|
|
"""
|
|
global _is_memory_tracing_enabled
|
|
_is_memory_tracing_enabled = False
|
|
|
|
if memory_trace is not None and len(memory_trace) > 1:
|
|
memory_diff_trace = []
|
|
cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])
|
|
for (frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem) in zip(
|
|
memory_trace[:-1], memory_trace[1:]
|
|
):
|
|
cpu_mem_inc = next_cpu_mem - cpu_mem
|
|
gpu_mem_inc = next_gpu_mem - gpu_mem
|
|
cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
|
|
memory_diff_trace.append(
|
|
MemoryState(
|
|
frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
|
|
)
|
|
)
|
|
cumulative_memory_dict[frame][0] += cpu_mem_inc
|
|
cumulative_memory_dict[frame][1] += gpu_mem_inc
|
|
cumulative_memory_dict[frame][2] += cpu_gpu_mem_inc
|
|
|
|
cumulative_memory = sorted(
|
|
list(cumulative_memory_dict.items()), key=lambda x: x[1][2], reverse=True
|
|
) # order by the total CPU + GPU memory increase
|
|
cumulative_memory = list(
|
|
MemoryState(
|
|
frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
|
|
)
|
|
for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
|
|
)
|
|
|
|
if ignore_released_memory:
|
|
total_memory = sum(max(0, step_trace.cpu_gpu.bytes) for step_trace in memory_diff_trace)
|
|
else:
|
|
total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace)
|
|
total_memory = Memory(total_memory)
|
|
return MemorySummary(sequential=memory_diff_trace, cumulative=cumulative_memory, total=total_memory)
|
|
|
|
return None
|
|
|
|
|
|
def bytes_to_human_readable(memory_amount: int) -> str:
|
|
""" Utility to convert a number of bytes (int) in a human readable string (with units)
|
|
"""
|
|
for unit in ["B", "KB", "MB", "GB"]:
|
|
if memory_amount > -1024.0 and memory_amount < 1024.0:
|
|
return "{:.3f}{}".format(memory_amount, unit)
|
|
memory_amount /= 1024.0
|
|
return "{:.3f}TB".format(memory_amount)
|