Source code for pfio.cache.multiprocess_file_cache

import errno
import fcntl
import numbers
import os
import pickle
import tempfile
import warnings
from struct import calcsize, pack, unpack

from pfio import cache
from pfio.cache.file_cache import _check_local, _default_cache_path


class _NoOpenNamedTemporaryFile(object):
    """Temporary file class

    This class wraps mkstemp and implements an auto-clean mechanism.
    The reason why we cannot use the tempfile.NamedTemporaryFile is that
    it has an unpicklable member because it opens the created temporary file,
    which makes it impossible to pass over to worker processes.

    The auto cleanup mechanism is based on the CPython tempfile implementation.
    https://github.com/python/cpython/blob/3.8/Lib/tempfile.py#L406-L446
    """

    # Set here since __del__ checks it
    name = None
    master_pid = None

    def __init__(self, dir, master_pid):
        _, self.name = tempfile.mkstemp(dir=dir)
        self.master_pid = master_pid

    def close(self, unlink=os.unlink, getpid=os.getpid):
        if self.name and self.master_pid == getpid():
            unlink(self.name)
            self.name = None

    def __del__(self):
        self.close()
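
# Note (illustrative, not part of the original module): unlike
# tempfile.NamedTemporaryFile, instances of this wrapper hold no open file
# object, only the path and the creator's pid, so they can be pickled and
# handed to worker processes (e.g. pickle.loads(pickle.dumps(tmp)) works).
# close() unlinks the file only when called in the creating (master) process;
# in any other process it is a no-op, so workers cannot delete the shared
# cache file.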


class _DummyTemporaryFile(object):
    """Dummy tempfile class that imitates the _NoOpenNamedTemporaryFile

    This class is used for MultiprocessFileCache.preload.
    A cache file fed from outside shouldn't be automatically deleted
    by close(), so preload() uses this dummy class instead.
    """

    def __init__(self, name):
        self.name = name

    def close(self):
        pass


class MultiprocessFileCache(cache.Cache):
    '''The Multiprocess-safe cache system on a local filesystem

    Stores cache data in a local temporary file, created in
    ``~/.pfio/cache`` by default. It automatically deletes the cache data
    after the object is collected. When this object is not correctly closed
    (e.g., the process is killed by SIGKILL), the cache remains after the
    process's death.

    This class supports handling a cache from multiple processes. A
    MultiprocessFileCache object can be handed over to another process
    via pickle. Calling ``get`` and ``put`` in each process will look
    into the same cache file with flock-based locking.

    The temporary cache file will persist as long as the
    MultiprocessFileCache object is alive in the original process that
    created it. Therefore, even after destroying the worker processes,
    the MultiprocessFileCache object can still be passed to another process.

    .. admonition:: Example

        Using MultiprocessFileCache is similar to the :class:`~NaiveCache`
        and :class:`~FileCache`. ::

            from pfio.cache import MultiprocessFileCache

            class MyDataset(torch.utils.data.Dataset):
                def __init__(self, image_paths):
                    self.paths = image_paths
                    self.cache = MultiprocessFileCache(len(image_paths),
                                                       do_pickle=True)
                ...

        When iterating over the dataset, it is common to load the data
        concurrently to hide the file IO bottleneck by setting a higher
        ``num_workers`` in the PyTorch DataLoader
        (https://pytorch.org/docs/stable/data.html). ::

            image_paths = open('/path/to/image_list.txt').read().splitlines()
            dataset = MyDataset(image_paths)
            loader = DataLoader(dataset, batch_size=64, num_workers=8)

            # Parallel data loading
            for epoch in range(10):
                for batch in loader:
                    ...

        In this case, the dataset is distributed to the worker processes,
        i.e., ``__getitem__`` of the dataset will be called in a different
        process from the one that initialized it.
        The ``MultiprocessFileCache`` object held by the dataset in each
        worker looks at the same cache file and handles the concurrent
        access based on the ``flock`` system call. Therefore the data
        inserted into the cache by one worker process can be accessed from
        another worker process safely.

        In case your task does not require concurrent data loading, i.e.,
        ``num_workers=0`` in DataLoader, consider using :class:`~FileCache`
        as it has less overhead for concurrency control.

    The persisted cache file created by ``preserve()`` can be used for
    :meth:`FileCache.preload` and vice versa.

    Arguments:
        length (int): Length of the cache array.

        do_pickle (bool):
            Do automatic pickle and unpickle inside the cache.

        dir (str): The path to the directory to place cache data in
            case the home directory is not backed by a fast storage
            device. Must not be an NFS.

        cache_size_limit (None or int):
            Limitation of the cache size in bytes. If the total amount
            of cached data reaches the limit, the cache will become
            frozen and no longer accept further addition.
            Data already stored in the cache can be accessed normally.
            None (default) and 0 mean unlimited.

        verbose (bool):
            Print detailed logs of the cache.

    '''  # NOQA

    def __init__(self, length, do_pickle=False,
                 dir=None, cache_size_limit=None, verbose=False):
        self.length = length
        self.do_pickle = do_pickle
        self.verbose = verbose
        if self.length <= 0 or (2 ** 64) <= self.length:
            raise ValueError("length has to be between 0 and 2^64")

        if not (cache_size_limit is None
                or (isinstance(cache_size_limit, numbers.Number)
                    and 0 <= cache_size_limit)):
            msg = "cache_size_limit has to be either None, zero " \
                  "(both indicate unlimited) or larger than 0. " \
                  "{} is specified.".format(cache_size_limit)
            raise ValueError(msg)

        self.cache_size_limit = cache_size_limit

        if dir is None:
            self.dir = _default_cache_path()
        else:
            self.dir = dir
        os.makedirs(self.dir, exist_ok=True)
        _check_local(self.dir)

        self.closed = False
        self._frozen = False

        self._master_pid = os.getpid()

        self.cache_file = _NoOpenNamedTemporaryFile(self.dir,
                                                    self._master_pid)
        cache_fd = os.open(self.cache_file.name, os.O_RDWR)
        if self.verbose:
            print('created cache file:', self.cache_file.name)

        try:
            fcntl.flock(cache_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)

            # Fill up the indices part of the cache file by index=0, size=-1.
            # Each index entry is 16 bytes: an 8-byte unsigned offset ('Q')
            # and an 8-byte signed length ('q'); length -1 marks an empty slot.
            buf = pack('Qq', 0, -1)
            self.buflen = calcsize('Qq')
            assert self.buflen == 16
            for i in range(self.length):
                offset = self.buflen * i
                r = os.pwrite(cache_fd, buf, offset)
                assert r == self.buflen
        except OSError as ose:
            # Lock acquisition error -> No problem, since another worker
            # should already be working on it
            if ose.errno not in (errno.EACCES, errno.EAGAIN):
                raise
        finally:
            fcntl.flock(cache_fd, fcntl.LOCK_UN)
            os.close(cache_fd)

        # Opened lazily at the first call of get or put in each child process
        self._fd_pid = None
        self.cache_fd = None

    def __len__(self):
        return self.length

    @property
    def multiprocess_safe(self) -> bool:
        return True

    @property
    def multithread_safe(self) -> bool:
        return True

    def get(self, i):
        if self.closed:
            return
        data = self._get(i)
        if self.do_pickle and data:
            data = pickle.loads(data)
        return data

    def _open_fds(self):
        pid = os.getpid()
        if self._fd_pid != pid:
            self._fd_pid = pid
            self.cache_fd = os.open(self.cache_file.name, os.O_RDWR)

    def _get(self, i):
        if i < 0 or self.length <= i:
            raise IndexError("index {} out of range ([0, {}])"
                             .format(i, self.length - 1))
        self._open_fds()

        offset = self.buflen * i

        fcntl.flock(self.cache_fd, fcntl.LOCK_SH)
        index_entry = os.pread(self.cache_fd, self.buflen, offset)
        (o, l) = unpack('Qq', index_entry)
        if l < 0 or o < 0:
            fcntl.flock(self.cache_fd, fcntl.LOCK_UN)
            return None

        data = os.pread(self.cache_fd, l, o)
        assert len(data) == l
        fcntl.flock(self.cache_fd, fcntl.LOCK_UN)
        return data

    def put(self, i, data):
        if self._frozen or self.closed:
            return False
        try:
            if self.do_pickle:
                data = pickle.dumps(data)
            return self._put(i, data)

        except OSError as ose:
            # Disk full (ENOSPC) possibly caused by the cache;
            # just warn and keep running
            if ose.errno == errno.ENOSPC:
                warnings.warn(ose.strerror, RuntimeWarning)
                return False
            else:
                raise ose

    def _put(self, i, data):
        if self.closed:
            return False
        if i < 0 or self.length <= i:
            raise IndexError("index {} out of range ([0, {}])"
                             .format(i, self.length - 1))
        self._open_fds()

        index_ofst = self.buflen * i

        fcntl.flock(self.cache_fd, fcntl.LOCK_EX)
        buf = os.pread(self.cache_fd, self.buflen, index_ofst)
        (o, l) = unpack('Qq', buf)
        if l >= 0 and o >= 0:
            # Data already exists
            fcntl.flock(self.cache_fd, fcntl.LOCK_UN)
            return False

        data_pos = os.lseek(self.cache_fd, 0, os.SEEK_END)
        if self.cache_size_limit:
            if self.cache_size_limit < (data_pos + len(data)):
                self._frozen = True
                fcntl.flock(self.cache_fd, fcntl.LOCK_UN)
                return False

        index_entry = pack('Qq', data_pos, len(data))
        assert os.pwrite(self.cache_fd, index_entry,
                         index_ofst) == self.buflen
        assert os.pwrite(self.cache_fd, data, data_pos) == len(data)
        os.fsync(self.cache_fd)

        fcntl.flock(self.cache_fd, fcntl.LOCK_UN)
        return True

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        self.close()

    def close(self):
        pid = os.getpid()
        if pid == self._fd_pid:
            os.close(self.cache_fd)
            self._fd_pid = None

        if not self.closed and pid == self._master_pid:
            self.cache_file.close()
            self.closed = True
            self.cache_file = None
            self.cache_fd = None
    def preload(self, name):
        '''Load the cache saved by ``preserve()``

        After loading the file, no data can be added to the cache.
        ``name`` is the name of the persistent file in the cache directory.

        When it succeeds, it returns ``True``.
        If there is no cache file with the specified name in the cache
        directory, it does nothing but returns ``False``.

        Note that ``preload()`` can be called only by the master process,
        i.e., the process where ``__init__()`` is called, in order to
        prevent inconsistency. When using it in a multiprocessing
        environment, you first need to create a ``MultiprocessFileCache``
        object, call its ``preload()``, and then pass it to the worker
        processes.

        Returns:
            bool: Returns True if it succeeds.

        .. note:: This feature is experimental.

        '''
        if self._frozen:
            if self.verbose:
                print("Failed to preload the cache from {}: "
                      "The cache is already frozen."
                      .format(name))
            return False

        if self._master_pid != os.getpid():
            raise RuntimeError("Cannot preload a cache in a worker process")

        # Overwrite the current cache with the specified cache file.
        # This is needed to prevent the specified cache file from being
        # deleted when the cache object is destroyed.
        ld_cache_file = os.path.join(self.dir, name)
        if not os.path.exists(ld_cache_file):
            if self.verbose:
                print('Failed to preload the cache from {}: '
                      'The specified cache not found in {}'
                      .format(name, self.dir))
            return False

        self.cache_file.close()
        self.cache_fd = None
        self.cache_file = _DummyTemporaryFile(ld_cache_file)
        self._frozen = True
        return True
    def preserve(self, name, overwrite=False):
        '''Preserve the cache as a persistent file on the disk

        Once the cache is preserved, the cache file will not be removed
        at cache close. To read data from the preserved file, use the
        ``preload()`` method. After preservation, no data can be added
        to the cache.

        ``name`` is the name of the persistent file saved into the cache
        directory.

        When it succeeds, it returns ``True``.
        If a cache file with the same name already exists in the cache
        directory, it does nothing but returns ``False``.

        Note that ``preserve()`` can be called only by the master process,
        i.e., the process where ``__init__()`` is called, in order to
        prevent inconsistency.

        The preserved cache can also be preloaded by :class:`~FileCache`.

        Arguments:
            name (str): Name of the preserved file. It is created in the
                same directory as the cache (``dir`` option to
                ``__init__``).

            overwrite (bool): Overwrite if already exists

        Returns:
            bool: Returns True if it succeeds.

        .. note:: This feature is experimental.

        '''
        if self._master_pid != os.getpid():
            raise RuntimeError("Cannot preserve a cache in a worker process")

        cache_file = os.path.join(self.dir, name)
        if overwrite:
            if os.path.exists(cache_file):
                os.unlink(cache_file)

        elif os.path.exists(cache_file):
            if self.verbose:
                print('Specified cache named "{}" already exists in {}'
                      .format(name, self.dir))
            return False

        self._open_fds()
        try:
            fcntl.flock(self.cache_fd, fcntl.LOCK_EX)
            os.link(self.cache_file.name, cache_file)
        except OSError as ose:
            # Lock acquisition error -> No problem, since another worker
            # should already be working on it
            if ose.errno not in (errno.EACCES, errno.EAGAIN):
                raise
        finally:
            fcntl.flock(self.cache_fd, fcntl.LOCK_UN)

        return True
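

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): the
# preserve()/preload() workflow described in the docstrings above. One run
# fills a cache and persists it under a name; a later run reloads it
# read-only. The length, the name "my_cache", and the payloads below are
# assumptions for illustration only.
if __name__ == "__main__":
    # First run: fill the cache and keep it on disk after close().
    demo = MultiprocessFileCache(4, do_pickle=True)
    for i in range(4):
        demo.put(i, {"index": i, "payload": "x" * 10})
    demo.preserve("my_cache", overwrite=True)
    demo.close()

    # Later run: a fresh object preloads the persisted file. After preload()
    # the cache is frozen, so get() works but put() returns False.
    reloaded = MultiprocessFileCache(4, do_pickle=True)
    assert reloaded.preload("my_cache")
    assert reloaded.get(0) == {"index": 0, "payload": "x" * 10}
    assert reloaded.put(0, "new data") is False
    reloaded.close()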