memmap.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. from __future__ import division, absolute_import, print_function
  2. import numpy as np
  3. from .numeric import uint8, ndarray, dtype
  4. from numpy.compat import (
  5. long, basestring, os_fspath, contextlib_nullcontext, is_pathlib_path
  6. )
  7. from numpy.core.overrides import set_module
  8. __all__ = ['memmap']
  9. dtypedescr = dtype
  10. valid_filemodes = ["r", "c", "r+", "w+"]
  11. writeable_filemodes = ["r+", "w+"]
  12. mode_equivalents = {
  13. "readonly":"r",
  14. "copyonwrite":"c",
  15. "readwrite":"r+",
  16. "write":"w+"
  17. }
  18. @set_module('numpy')
  19. class memmap(ndarray):
  20. """Create a memory-map to an array stored in a *binary* file on disk.
  21. Memory-mapped files are used for accessing small segments of large files
  22. on disk, without reading the entire file into memory. NumPy's
  23. memmap's are array-like objects. This differs from Python's ``mmap``
  24. module, which uses file-like objects.
  25. This subclass of ndarray has some unpleasant interactions with
  26. some operations, because it doesn't quite fit properly as a subclass.
  27. An alternative to using this subclass is to create the ``mmap``
  28. object yourself, then create an ndarray with ndarray.__new__ directly,
  29. passing the object created in its 'buffer=' parameter.
  30. This class may at some point be turned into a factory function
  31. which returns a view into an mmap buffer.
  32. Delete the memmap instance to close the memmap file.
  33. Parameters
  34. ----------
  35. filename : str, file-like object, or pathlib.Path instance
  36. The file name or file object to be used as the array data buffer.
  37. dtype : data-type, optional
  38. The data-type used to interpret the file contents.
  39. Default is `uint8`.
  40. mode : {'r+', 'r', 'w+', 'c'}, optional
  41. The file is opened in this mode:
  42. +------+-------------------------------------------------------------+
  43. | 'r' | Open existing file for reading only. |
  44. +------+-------------------------------------------------------------+
  45. | 'r+' | Open existing file for reading and writing. |
  46. +------+-------------------------------------------------------------+
  47. | 'w+' | Create or overwrite existing file for reading and writing. |
  48. +------+-------------------------------------------------------------+
  49. | 'c' | Copy-on-write: assignments affect data in memory, but |
  50. | | changes are not saved to disk. The file on disk is |
  51. | | read-only. |
  52. +------+-------------------------------------------------------------+
  53. Default is 'r+'.
  54. offset : int, optional
  55. In the file, array data starts at this offset. Since `offset` is
  56. measured in bytes, it should normally be a multiple of the byte-size
  57. of `dtype`. When ``mode != 'r'``, even positive offsets beyond end of
  58. file are valid; The file will be extended to accommodate the
  59. additional data. By default, ``memmap`` will start at the beginning of
  60. the file, even if ``filename`` is a file pointer ``fp`` and
  61. ``fp.tell() != 0``.
  62. shape : tuple, optional
  63. The desired shape of the array. If ``mode == 'r'`` and the number
  64. of remaining bytes after `offset` is not a multiple of the byte-size
  65. of `dtype`, you must specify `shape`. By default, the returned array
  66. will be 1-D with the number of elements determined by file size
  67. and data-type.
  68. order : {'C', 'F'}, optional
  69. Specify the order of the ndarray memory layout:
  70. :term:`row-major`, C-style or :term:`column-major`,
  71. Fortran-style. This only has an effect if the shape is
  72. greater than 1-D. The default order is 'C'.
  73. Attributes
  74. ----------
  75. filename : str or pathlib.Path instance
  76. Path to the mapped file.
  77. offset : int
  78. Offset position in the file.
  79. mode : str
  80. File mode.
  81. Methods
  82. -------
  83. flush
  84. Flush any changes in memory to file on disk.
  85. When you delete a memmap object, flush is called first to write
  86. changes to disk before removing the object.
  87. See also
  88. --------
  89. lib.format.open_memmap : Create or load a memory-mapped ``.npy`` file.
  90. Notes
  91. -----
  92. The memmap object can be used anywhere an ndarray is accepted.
  93. Given a memmap ``fp``, ``isinstance(fp, numpy.ndarray)`` returns
  94. ``True``.
  95. Memory-mapped files cannot be larger than 2GB on 32-bit systems.
  96. When a memmap causes a file to be created or extended beyond its
  97. current size in the filesystem, the contents of the new part are
  98. unspecified. On systems with POSIX filesystem semantics, the extended
  99. part will be filled with zero bytes.
  100. Examples
  101. --------
  102. >>> data = np.arange(12, dtype='float32')
  103. >>> data.resize((3,4))
  104. This example uses a temporary file so that doctest doesn't write
  105. files to your directory. You would use a 'normal' filename.
  106. >>> from tempfile import mkdtemp
  107. >>> import os.path as path
  108. >>> filename = path.join(mkdtemp(), 'newfile.dat')
  109. Create a memmap with dtype and shape that matches our data:
  110. >>> fp = np.memmap(filename, dtype='float32', mode='w+', shape=(3,4))
  111. >>> fp
  112. memmap([[0., 0., 0., 0.],
  113. [0., 0., 0., 0.],
  114. [0., 0., 0., 0.]], dtype=float32)
  115. Write data to memmap array:
  116. >>> fp[:] = data[:]
  117. >>> fp
  118. memmap([[ 0., 1., 2., 3.],
  119. [ 4., 5., 6., 7.],
  120. [ 8., 9., 10., 11.]], dtype=float32)
  121. >>> fp.filename == path.abspath(filename)
  122. True
  123. Deletion flushes memory changes to disk before removing the object:
  124. >>> del fp
  125. Load the memmap and verify data was stored:
  126. >>> newfp = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
  127. >>> newfp
  128. memmap([[ 0., 1., 2., 3.],
  129. [ 4., 5., 6., 7.],
  130. [ 8., 9., 10., 11.]], dtype=float32)
  131. Read-only memmap:
  132. >>> fpr = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
  133. >>> fpr.flags.writeable
  134. False
  135. Copy-on-write memmap:
  136. >>> fpc = np.memmap(filename, dtype='float32', mode='c', shape=(3,4))
  137. >>> fpc.flags.writeable
  138. True
  139. It's possible to assign to copy-on-write array, but values are only
  140. written into the memory copy of the array, and not written to disk:
  141. >>> fpc
  142. memmap([[ 0., 1., 2., 3.],
  143. [ 4., 5., 6., 7.],
  144. [ 8., 9., 10., 11.]], dtype=float32)
  145. >>> fpc[0,:] = 0
  146. >>> fpc
  147. memmap([[ 0., 0., 0., 0.],
  148. [ 4., 5., 6., 7.],
  149. [ 8., 9., 10., 11.]], dtype=float32)
  150. File on disk is unchanged:
  151. >>> fpr
  152. memmap([[ 0., 1., 2., 3.],
  153. [ 4., 5., 6., 7.],
  154. [ 8., 9., 10., 11.]], dtype=float32)
  155. Offset into a memmap:
  156. >>> fpo = np.memmap(filename, dtype='float32', mode='r', offset=16)
  157. >>> fpo
  158. memmap([ 4., 5., 6., 7., 8., 9., 10., 11.], dtype=float32)
  159. """
  160. __array_priority__ = -100.0
  161. def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0,
  162. shape=None, order='C'):
  163. # Import here to minimize 'import numpy' overhead
  164. import mmap
  165. import os.path
  166. try:
  167. mode = mode_equivalents[mode]
  168. except KeyError:
  169. if mode not in valid_filemodes:
  170. raise ValueError("mode must be one of %s" %
  171. (valid_filemodes + list(mode_equivalents.keys())))
  172. if mode == 'w+' and shape is None:
  173. raise ValueError("shape must be given")
  174. if hasattr(filename, 'read'):
  175. f_ctx = contextlib_nullcontext(filename)
  176. else:
  177. f_ctx = open(os_fspath(filename), ('r' if mode == 'c' else mode)+'b')
  178. with f_ctx as fid:
  179. fid.seek(0, 2)
  180. flen = fid.tell()
  181. descr = dtypedescr(dtype)
  182. _dbytes = descr.itemsize
  183. if shape is None:
  184. bytes = flen - offset
  185. if bytes % _dbytes:
  186. raise ValueError("Size of available data is not a "
  187. "multiple of the data-type size.")
  188. size = bytes // _dbytes
  189. shape = (size,)
  190. else:
  191. if not isinstance(shape, tuple):
  192. shape = (shape,)
  193. size = np.intp(1) # avoid default choice of np.int_, which might overflow
  194. for k in shape:
  195. size *= k
  196. bytes = long(offset + size*_dbytes)
  197. if mode in ('w+', 'r+') and flen < bytes:
  198. fid.seek(bytes - 1, 0)
  199. fid.write(b'\0')
  200. fid.flush()
  201. if mode == 'c':
  202. acc = mmap.ACCESS_COPY
  203. elif mode == 'r':
  204. acc = mmap.ACCESS_READ
  205. else:
  206. acc = mmap.ACCESS_WRITE
  207. start = offset - offset % mmap.ALLOCATIONGRANULARITY
  208. bytes -= start
  209. array_offset = offset - start
  210. mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start)
  211. self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
  212. offset=array_offset, order=order)
  213. self._mmap = mm
  214. self.offset = offset
  215. self.mode = mode
  216. if is_pathlib_path(filename):
  217. # special case - if we were constructed with a pathlib.path,
  218. # then filename is a path object, not a string
  219. self.filename = filename.resolve()
  220. elif hasattr(fid, "name") and isinstance(fid.name, basestring):
  221. # py3 returns int for TemporaryFile().name
  222. self.filename = os.path.abspath(fid.name)
  223. # same as memmap copies (e.g. memmap + 1)
  224. else:
  225. self.filename = None
  226. return self
  227. def __array_finalize__(self, obj):
  228. if hasattr(obj, '_mmap') and np.may_share_memory(self, obj):
  229. self._mmap = obj._mmap
  230. self.filename = obj.filename
  231. self.offset = obj.offset
  232. self.mode = obj.mode
  233. else:
  234. self._mmap = None
  235. self.filename = None
  236. self.offset = None
  237. self.mode = None
  238. def flush(self):
  239. """
  240. Write any changes in the array to the file on disk.
  241. For further information, see `memmap`.
  242. Parameters
  243. ----------
  244. None
  245. See Also
  246. --------
  247. memmap
  248. """
  249. if self.base is not None and hasattr(self.base, 'flush'):
  250. self.base.flush()
  251. def __array_wrap__(self, arr, context=None):
  252. arr = super(memmap, self).__array_wrap__(arr, context)
  253. # Return a memmap if a memmap was given as the output of the
  254. # ufunc. Leave the arr class unchanged if self is not a memmap
  255. # to keep original memmap subclasses behavior
  256. if self is arr or type(self) is not memmap:
  257. return arr
  258. # Return scalar instead of 0d memmap, e.g. for np.sum with
  259. # axis=None
  260. if arr.shape == ():
  261. return arr[()]
  262. # Return ndarray otherwise
  263. return arr.view(np.ndarray)
  264. def __getitem__(self, index):
  265. res = super(memmap, self).__getitem__(index)
  266. if type(res) is memmap and res._mmap is None:
  267. return res.view(type=ndarray)
  268. return res