Files
HuggingFace_transformer/src/transformers/benchmark_utils.py
Thomas Wolf 2187c49f5c CPU/GPU memory benchmarking utilities - Remove support for python 3.5 (now only 3.6+) (#3186)
* memory benchmark rss

* have both forward pass and line-by-line mem tracing

* cleaned up tracing

* refactored and cleaning up API

* no f-strings yet...

* add GPU mem logging

* fix GPU memory monitoring

* style and quality

* clean up and doc

* update with comments

* Switching to python 3.6+

* fix quality
2020-03-17 10:17:11 -04:00

342 lines
15 KiB
Python

"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
import linecache
import logging
import os
import sys
from collections import defaultdict
from typing import Iterable, List, NamedTuple, Optional, Union
from .file_utils import is_tf_available, is_torch_available
if is_torch_available():
from torch.cuda import empty_cache as torch_empty_cache
if is_tf_available():
from tensorflow.python.eager import context as tf_context
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
_is_memory_tracing_enabled = False
def is_memory_tracing_enabled():
global _is_memory_tracing_enabled
return _is_memory_tracing_enabled
class Frame(NamedTuple):
""" `Frame` is a NamedTuple used to gather the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""
filename: str
module: str
line_number: int
event: str
line_text: str
class UsedMemoryState(NamedTuple):
""" `UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
"""
frame: Frame
cpu_memory: int
gpu_memory: int
class Memory(NamedTuple):
""" `Memory` NamedTuple have a single field `bytes` and
you can get a human readable string of the number of bytes by calling `__repr__`
- `byte` (integer): number of bytes,
"""
bytes: int
def __repr__(self) -> str:
return bytes_to_human_readable(self.bytes)
class MemoryState(NamedTuple):
""" `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
"""
frame: Frame
cpu: Memory
gpu: Memory
cpu_gpu: Memory
class MemorySummary(NamedTuple):
""" `MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by substracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeted memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
"""
sequential: List[MemoryState]
cumulative: List[MemoryState]
total: Memory
MemoryTrace = List[UsedMemoryState]
def start_memory_tracing(
modules_to_trace: Optional[Union[str, Iterable[str]]] = None,
modules_not_to_trace: Optional[Union[str, Iterable[str]]] = None,
events_to_trace: str = "line",
gpus_to_trace: Optional[List[int]] = None,
) -> MemoryTrace:
""" Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
See `../../examples/benchmarks.py for a usage example.
Current memory consumption is returned using psutil and in particular is the RSS memory
"Resident Set Size” (the non-swapped physical memory the process is using).
See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
Args:
- `modules_to_trace`: (None, string, list/tuple of string)
if None, all events are recorded
if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
- `modules_not_to_trace`: (None, string, list/tuple of string)
if None, no module is avoided
if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
- `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
default to line
- `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs
Return:
- `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
- `UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
`Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""
try:
import psutil
except (ImportError):
logger.warning(
"Psutil not installed, we won't log CPU memory usage. "
"Install psutil (pip install psutil) to use CPU memory tracing."
)
process = None
else:
process = psutil.Process(os.getpid())
try:
from py3nvml import py3nvml
py3nvml.nvmlInit()
devices = list(range(py3nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
py3nvml.nvmlShutdown()
except ImportError:
logger.warning(
"py3nvml not installed, we won't log GPU memory usage. "
"Install py3nvml (pip install py3nvml) to use GPU memory tracing."
)
log_gpu = False
except (OSError, py3nvml.NVMLError):
logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.")
log_gpu = False
else:
log_gpu = is_torch_available() or is_tf_available()
memory_trace = []
def traceit(frame, event, args):
""" Tracing method executed before running each line in a module or sub-module
Record memory allocated in a list with debugging information
"""
global _is_memory_tracing_enabled
if not _is_memory_tracing_enabled:
return traceit
# Filter events
if events_to_trace is not None:
if isinstance(events_to_trace, str) and event != events_to_trace:
return traceit
elif isinstance(events_to_trace, (list, tuple)) and event not in events_to_trace:
return traceit
# Filter modules
name = frame.f_globals["__name__"]
if not isinstance(name, str):
return traceit
else:
# Filter whitelist of modules to trace
if modules_to_trace is not None:
if isinstance(modules_to_trace, str) and modules_to_trace not in name:
return traceit
elif isinstance(modules_to_trace, (list, tuple)) and all(m not in name for m in modules_to_trace):
return traceit
# Filter blacklist of modules not to trace
if modules_not_to_trace is not None:
if isinstance(modules_not_to_trace, str) and modules_not_to_trace in name:
return traceit
elif isinstance(modules_not_to_trace, (list, tuple)) and any(m in name for m in modules_not_to_trace):
return traceit
# Record current tracing state (file, location in file...)
lineno = frame.f_lineno
filename = frame.f_globals["__file__"]
if filename.endswith(".pyc") or filename.endswith(".pyo"):
filename = filename[:-1]
line = linecache.getline(filename, lineno).rstrip()
traced_state = Frame(filename, name, lineno, event, line)
# Record current memory state (rss memory) and compute difference with previous memory state
cpu_mem = 0
if process is not None:
mem = process.memory_info()
cpu_mem = mem.rss
gpu_mem = 0
if log_gpu:
# Clear GPU caches
if is_torch_available():
torch_empty_cache()
if is_tf_available():
tf_context.context()._clear_caches() # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802
# Sum used memory for all GPUs
py3nvml.nvmlInit()
for i in devices:
handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
gpu_mem += meminfo.used
py3nvml.nvmlShutdown()
mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem)
memory_trace.append(mem_state)
return traceit
sys.settrace(traceit)
global _is_memory_tracing_enabled
_is_memory_tracing_enabled = True
return memory_trace
def stop_memory_tracing(
memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True
) -> Optional[MemorySummary]:
""" Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
Args:
- `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
- `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total memory
Return:
- None if `memory_trace` is None
- `MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by substracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeted memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
`Memory` named tuple have fields
- `byte` (integer): number of bytes,
- `string` (string): same as human readable string (ex: "3.5MB")
`Frame` are namedtuple used to list the current frame state and have the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
"""
global _is_memory_tracing_enabled
_is_memory_tracing_enabled = False
if memory_trace is not None and len(memory_trace) > 1:
memory_diff_trace = []
cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])
for (frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem) in zip(
memory_trace[:-1], memory_trace[1:]
):
cpu_mem_inc = next_cpu_mem - cpu_mem
gpu_mem_inc = next_gpu_mem - gpu_mem
cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
memory_diff_trace.append(
MemoryState(
frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
)
)
cumulative_memory_dict[frame][0] += cpu_mem_inc
cumulative_memory_dict[frame][1] += gpu_mem_inc
cumulative_memory_dict[frame][2] += cpu_gpu_mem_inc
cumulative_memory = sorted(
list(cumulative_memory_dict.items()), key=lambda x: x[1][2], reverse=True
) # order by the total CPU + GPU memory increase
cumulative_memory = list(
MemoryState(
frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
)
for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
)
if ignore_released_memory:
total_memory = sum(max(0, step_trace.cpu_gpu.bytes) for step_trace in memory_diff_trace)
else:
total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace)
total_memory = Memory(total_memory)
return MemorySummary(sequential=memory_diff_trace, cumulative=cumulative_memory, total=total_memory)
return None
def bytes_to_human_readable(memory_amount: int) -> str:
""" Utility to convert a number of bytes (int) in a human readable string (with units)
"""
for unit in ["B", "KB", "MB", "GB"]:
if memory_amount > -1024.0 and memory_amount < 1024.0:
return "{:.3f}{}".format(memory_amount, unit)
memory_amount /= 1024.0
return "{:.3f}TB".format(memory_amount)