Import tensorflow

This commit is contained in:
2026-02-15 21:45:42 -08:00
parent f3e8b90764
commit c530630153
20524 changed files with 9017694 additions and 25 deletions
@@ -0,0 +1,15 @@
# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2013 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
# and contributor agreement.
"""
This subpackage implements the high-level interface for h5py.
Don't manually import things from here; the public API lives directly
in the top-level package namespace.
"""
@@ -0,0 +1,277 @@
# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2013 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
# and contributor agreement.
"""
Implements high-level operations for attributes.
Provides the AttributeManager class, available on high-level objects
as <obj>.attrs.
"""
import numpy
from .. import h5, h5s, h5t, h5a, h5p
from . import base
from .base import phil, with_phil, Empty, is_empty_dataspace, product
from .datatype import Datatype
class AttributeManager(base.MutableMappingHDF5, base.CommonStateObject):

    """
        Dictionary-style access to the attributes of an HDF5 object.

        Instances are created by the library and exposed on high-level
        objects as <object>.attrs.

        Much like Group objects, this offers a minimal dictionary-style
        interface.  Any value which can reasonably be converted to a NumPy
        array or NumPy scalar can be stored.

        Assigning with <obj>.attrs[name] = value creates the attribute,
        deducing the HDF5 type from the value; an existing attribute of the
        same name is overwritten.

        Use modify() to change an existing attribute while keeping its type,
        and create() to request a particular type and shape.
    """

    def __init__(self, parent):
        """ Private constructor; keeps the low-level id of *parent*.
        """
        self._id = parent.id

    @with_phil
    def __getitem__(self, name):
        """ Read the value of the attribute called *name*.

        Returns an Empty instance when the attribute's shape is None, the
        single element (arr[()]) for a zero-dimensional result, and a NumPy
        array otherwise.
        """
        attr = h5a.open(self._id, self._e(name))
        shape = attr.shape

        # An empty dataspace reports its shape as None.
        if shape is None:
            return Empty(attr.dtype)

        dtype = attr.dtype

        # Create the HDF5 memory type now: the dtype may be swapped out
        # below when it is a top-level array type.
        htype = h5t.py_create(dtype)

        # NumPy has no top-level array types, so read into an array with a
        # "faked" shape and dtype.  E.g. attr.shape == (5,) together with
        # attr.dtype == '(3,)f' is read as shape (5, 3) with dtype 'f'.
        if dtype.subdtype is not None:
            base_dtype, inner_shape = dtype.subdtype
            shape = attr.shape + inner_shape
            dtype = base_dtype

        arr = numpy.zeros(shape, dtype=dtype, order='C')
        attr.read(arr, mtype=htype)

        string_info = h5t.check_string_dtype(dtype)
        if string_info and (string_info.length is None):
            # Variable-length strings: turn the bytes into Python str
            decoded = [
                raw.decode('utf-8', 'surrogateescape') for raw in arr.flat
            ]
            arr = numpy.array(decoded, dtype=dtype).reshape(arr.shape)

        return arr[()] if arr.ndim == 0 else arr

    def get_id(self, name):
        """ Open the named attribute and return its low-level AttrID.
        """
        return h5a.open(self._id, self._e(name))

    @with_phil
    def __setitem__(self, name, value):
        """ Store *value* under *name*, replacing any existing attribute.

        Type and shape are taken from the data.  To pick a specific type or
        shape, or to keep the type of an existing attribute, use create()
        and modify() instead.
        """
        self.create(name, data=value)

    @with_phil
    def __delitem__(self, name):
        """ Delete the attribute *name* (which must already exist). """
        h5a.delete(self._id, self._e(name))

    def create(self, name, data, shape=None, dtype=None):
        """ Create a new attribute, overwriting any existing attribute.

        name
            Name of the new attribute (required)
        data
            An array to initialize the attribute (required)
        shape
            Shape of the attribute.  Takes precedence over data.shape when
            both are given; the total number of points must then be the same.
        dtype
            Data type of the attribute.  Takes precedence over data.dtype
            when both are given.

        Raises ValueError when the requested shape or array dtype does not
        fit the data.
        """
        name = self._e(name)

        with phil:
            is_empty = isinstance(data, Empty)

            # Get a NumPy array first; the actual type conversion is left
            # to HDF5.
            if not is_empty:
                data = base.array_for_new_object(data, specified_dtype=dtype)

            if shape is None:
                shape = data.shape
            elif isinstance(shape, int):
                shape = (shape,)

            # A committed type, when given, has to be passed to h5a.create.
            use_htype = None

            if isinstance(dtype, Datatype):
                use_htype = dtype.id
                dtype = dtype.dtype
            elif dtype is None:
                dtype = data.dtype
            else:
                # Covers e.g. a string such as 'i8'
                dtype = numpy.dtype(dtype)

            # Kept for top-level array types, where dtype is replaced below.
            original_dtype = dtype

            if dtype.subdtype is not None:
                # Top-level array type requested: present the data as a
                # smaller array of subarrays.
                base_dtype, inner_shape = dtype.subdtype

                # The subarray shape must equal the trailing axes' sizes.
                if shape[-len(inner_shape):] != inner_shape:
                    raise ValueError("Array dtype shape %s is incompatible with data shape %s" % (inner_shape, shape))

                # New "advertised" shape and dtype
                shape = shape[0:len(shape)-len(inner_shape)]
                dtype = base_dtype
            else:
                # Not an array type: the element count must agree, and the
                # data is reshaped when the shapes differ.
                if shape is not None and product(shape) != product(data.shape):
                    raise ValueError("Shape of new attribute conflicts with shape of data")

                if shape != data.shape:
                    data = data.reshape(shape)

            # Needed to handle special string types.
            if not isinstance(data, Empty):
                data = numpy.asarray(data, dtype=dtype)

            # HDF5 datatype(s) for the H5A calls.  The type used for writing
            # must be the bit-for-bit representation rather than the
            # logical one.
            if use_htype is not None:
                htype = use_htype
                htype2 = None
            else:
                htype = h5t.py_create(original_dtype, logical=True)
                htype2 = h5t.py_create(original_dtype)

            # HDF5 dataspace for the H5A calls.
            if isinstance(data, Empty):
                space = h5s.create(h5s.NULL)
            else:
                space = h5s.create_simple(shape)

            # For a long time, h5py would create attributes with a random
            # name and then rename them, imitating atomic file replacement.
            # HDF5 offers no atomic replacement (the existing attribute has
            # to be deleted first), and renaming exposes some bugs - see
            # https://github.com/h5py/h5py/issues/1385
            # Hence the simpler delete & recreate model.
            if h5a.exists(self._id, name):
                h5a.delete(self._id, name)

            attr = h5a.create(self._id, name, htype, space)
            try:
                if not isinstance(data, Empty):
                    attr.write(data, mtype=htype2)
            except BaseException:
                # Remove the half-created attribute, then re-raise.
                attr.close()
                h5a.delete(self._id, name)
                raise
            attr.close()

    def modify(self, name, value):
        """ Change the value of an attribute while preserving its type.

        Unlike __setitem__, the type of an already existing attribute is
        kept, which can be very useful for files generated externally.

        An attribute which does not exist yet is created automatically.

        Raises OSError for empty attributes and TypeError when the shape of
        the data does not fit the existing attribute.
        """
        with phil:
            if name not in self:
                self[name] = value
                return

            attr = h5a.open(self._id, self._e(name))

            if is_empty_dataspace(attr):
                raise OSError("Empty attributes can't be modified")

            # Arrays are passed through so HDF5 does the conversion; for
            # lists and the like, NumPy is told the dtype instead of
            # guessing one.
            target_dtype = None if isinstance(value, numpy.ndarray) else attr.dtype
            value = numpy.asarray(value, order='C', dtype=target_dtype)

            # () <-> (1,) is tolerated
            shapes_differ = value.shape != attr.shape
            if shapes_differ and not (value.size == 1 and product(attr.shape) == 1):
                raise TypeError("Shape of data is incompatible with existing attribute")
            attr.write(value)

    @with_phil
    def __len__(self):
        """ Number of attributes attached to the object. """
        # More than 2**32 attributes are not expected
        return h5a.get_num_attrs(self._id)

    def __iter__(self):
        """ Iterate over the names of attributes. """
        with phil:
            names = []

            def collect(name, *args):
                """ Callback gathering the decoded attribute names """
                names.append(self._d(name))

            cpl = self._id.get_create_plist()
            crt_order = cpl.get_attr_creation_order()
            cpl.close()

            # Follow creation order when it is tracked, name order otherwise.
            tracked = crt_order & h5p.CRT_ORDER_TRACKED
            idx_type = h5.INDEX_CRT_ORDER if tracked else h5.INDEX_NAME

            h5a.iterate(self._id, collect, index_type=idx_type)

        # Names were gathered up front, so the lock is not held while
        # yielding.
        for attr_name in names:
            yield attr_name

    @with_phil
    def __contains__(self, name):
        """ Determine if an attribute exists, by name. """
        return h5a.exists(self._id, self._e(name))

    @with_phil
    def __repr__(self):
        if self._id:
            return "<Attributes of HDF5 object at %s>" % id(self._id)
        return "<Attributes of closed HDF5 object>"
@@ -0,0 +1,535 @@
# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2013 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
# and contributor agreement.
"""
Implements operations common to all high-level objects (File, etc.).
"""
from collections.abc import (
Mapping, MutableMapping, KeysView, ValuesView, ItemsView
)
import os
import posixpath
import numpy as np
# The high-level interface is serialized; every public API function & method
# is wrapped in a lock. We reuse the low-level lock because (1) it's fast,
# and (2) it eliminates the possibility of deadlocks due to out-of-order
# lock acquisition.
from .._objects import phil, with_phil
from .. import h5d, h5i, h5r, h5p, h5f, h5t, h5s
from .compat import filename_encode
def is_hdf5(fname):
    """ Tell whether *fname* is a valid HDF5 file.

    Anything which is not an existing regular file gives False.
    """
    with phil:
        path = os.path.abspath(os.fspath(fname))
        if not os.path.isfile(path):
            return False
        return h5f.is_hdf5(filename_encode(path))
def find_item_type(data):
    """Return the common item type of an object or (nested) collection.

    E.g. [[['a']]] -> str

    Only collections whose items all share one type are of interest; None is
    returned when that is not the case.

    NumPy arrays of Python objects are treated like ordinary Python
    collections, whereas arrays with a specific dtype give None.  Only
    array-like collections - lists and tuples, possibly nested - are looked
    into, not things like sets or dicts.
    """
    if isinstance(data, (list, tuple)):
        item_types = {find_item_type(element) for element in data}
    elif isinstance(data, np.ndarray):
        plain_object_array = (
            data.dtype.kind == 'O'
            and not h5t.check_string_dtype(data.dtype)
            and not h5t.check_vlen_dtype(data.dtype)
        )
        if not plain_object_array:
            return None
        item_types = {type(element) for element in data.flat}
    else:
        # A simple object: its own type is the answer.
        return type(data)

    # Zero or several distinct types means there is no common item type.
    return item_types.pop() if len(item_types) == 1 else None
def guess_dtype(data):
    """ Try to guess a suitable dtype for *data*.

    Returns None when nothing is appropriate (or when the choice should be
    left up to the array constructor).
    """
    with phil:
        # Region references are tested before plain references.
        if isinstance(data, h5r.RegionReference):
            return h5t.regionref_dtype
        if isinstance(data, h5r.Reference):
            return h5t.ref_dtype

        common_type = find_item_type(data)

        if common_type is bytes:
            return h5t.string_dtype(encoding='ascii')
        elif common_type is str:
            return h5t.string_dtype()
        else:
            return None
def is_float16_dtype(dt):
    """ True when *dt* describes a 2-byte floating point type; False for None.
    """
    if dt is None:
        return False

    # Accept strings and scalar types as well as np.dtype objects.
    normalized = np.dtype(dt)
    return normalized.kind == 'f' and normalized.itemsize == 2
def array_for_new_object(data, specified_dtype=None):
    """Turn *data* into the array used for a new dataset or attribute"""
    if specified_dtype is not None and not isinstance(specified_dtype, np.dtype):
        specified_dtype = np.dtype(specified_dtype)

    # HDF5 mostly converts data as needed when it is written.  Two cases
    # use the requested dtype up front:
    #  * a float16 target is pre-converted in Python to work around a bug in
    #    the conversion, https://github.com/h5py/h5py/issues/819
    #  * converting e.g. a list to an array, NumPy should not have to guess
    #    a dtype which is already known.
    use_specified = is_float16_dtype(specified_dtype) or (
        not isinstance(data, np.ndarray) and (specified_dtype is not None)
    )
    as_dtype = specified_dtype if use_specified else guess_dtype(data)

    data = np.asarray(data, order="C", dtype=as_dtype)

    # Usually a no-op.  When data already was an array and as_dtype is a
    # tagged h5py dtype (e.g. for an object array of strings), asarray()
    # keeps the old dtype object; the view attaches the tagged one.
    if as_dtype is not None:
        data = data.view(dtype=as_dtype)
    return data
def default_lapl():
    """ Default link access property list (currently always None) """
    lapl = None
    return lapl
def default_lcpl():
    """ Default link creation property list.

    A fresh LINK_CREATE property list with creation of intermediate groups
    switched on.
    """
    plist = h5p.create(h5p.LINK_CREATE)
    plist.set_create_intermediate_group(True)
    return plist
# Module-wide defaults, built once at import time (access list first).
dlapl, dlcpl = default_lapl(), default_lcpl()
def is_empty_dataspace(obj):
    """ True when the dataspace of *obj* has the extent type h5s.NULL """
    extent_type = obj.get_space().get_simple_extent_type()
    return True if extent_type == h5s.NULL else False
class CommonStateObject:

    """
        Mixin for sharing information between objects which live in the same
        HDF5 file.  The host class must provide a ".id" attribute returning
        a low-level ObjectID subclass.

        Also implements the Unicode name encoding/decoding helpers.
    """

    @property
    def _lapl(self):
        """ Link access property list appropriate for this object
        """
        return dlapl

    @property
    def _lcpl(self):
        """ Link creation property list appropriate for this object
        """
        return dlcpl

    def _e(self, name, lcpl=None):
        """ Encode a name according to the current file settings.

        Returns the encoded name, or the 2-tuple (name, lcpl) when lcpl is
        true.

        - Binary strings are passed through as-is, h5t.CSET_ASCII
        - Unicode strings which are pure ASCII are encoded as such,
          h5t.CSET_ASCII; all others are encoded utf8, h5t.CSET_UTF8

        A name of None gives None or (None, None) respectively.  Anything
        other than str or bytes raises TypeError.
        """
        if name is None:
            return (None, None) if lcpl else None

        if isinstance(name, bytes):
            coding = h5t.CSET_ASCII
        elif isinstance(name, str):
            try:
                encoded = name.encode('ascii')
            except UnicodeEncodeError:
                encoded = name.encode('utf8')
                coding = h5t.CSET_UTF8
            else:
                coding = h5t.CSET_ASCII
            name = encoded
        else:
            raise TypeError(f"A name should be string or bytes, not {type(name)}")

        if not lcpl:
            return name

        # Copy of the link creation property list carrying the encoding.
        plist = self._lcpl.copy()
        plist.set_char_encoding(coding)
        return name, plist

    def _d(self, name):
        """ Decode a name according to the current file settings.

        - Try to decode as utf8
        - If that fails, hand back the byte string unchanged

        A name of None gives None.
        """
        if name is None:
            return None

        try:
            return name.decode('utf8')
        except UnicodeDecodeError:
            return name
class _RegionProxy:
"""
Proxy object which handles region references.
To create a new region reference (datasets only), use slicing syntax:
>>> newref = obj.regionref[0:10:2]
To determine the target dataset shape from an existing reference:
>>> shape = obj.regionref.shape(existingref)
where <obj> may be any object in the file. To determine the shape of
the selection in use on the target dataset:
>>> selection_shape = obj.regionref.selection(existingref)
"""
def __init__(self, obj):
self.obj = obj
self.id = obj.id
def __getitem__(self, args):
if not isinstance(self.id, h5d.DatasetID):
raise TypeError("Region references can only be made to datasets")
from . import selections
with phil:
selection = selections.select(self.id.shape, args, dataset=self.obj)
return h5r.create(self.id, b'.', h5r.DATASET_REGION, selection.id)
def shape(self, ref):
""" Get the shape of the target dataspace referred to by *ref*. """
with phil:
sid = h5r.get_region(ref, self.id)
return sid.shape
def selection(self, ref):
""" Get the shape of the target dataspace selection referred to by *ref*
"""
from . import selections
with phil:
sid = h5r.get_region(ref, self.id)
return selections.guess_shape(sid)
class HLObject(CommonStateObject):

    """
        Base class shared by the high-level interface objects.
    """

    @with_phil
    def __init__(self, oid):
        """ Bind this object to its low-level identifier *oid* """
        self._id = oid

    @property
    @with_phil
    def id(self):
        """ Low-level identifier appropriate for this object """
        return self._id

    @property
    def file(self):
        """ Return a File instance associated with this object """
        # NOTE(review): imported locally, presumably to avoid a circular
        # import with the files module - confirm.
        from . import files

        with phil:
            return files.File(self.id)

    @property
    @with_phil
    def name(self):
        """ Return the full name of this object.  None if anonymous. """
        raw_name = h5i.get_name(self.id)
        return self._d(raw_name)

    @property
    @with_phil
    def parent(self):
        """Return the parent group of this object.

        This is always equivalent to obj.file[posixpath.dirname(obj.name)].
        ValueError if this object is anonymous.
        """
        if self.name is None:
            raise ValueError("Parent of an anonymous object is undefined")
        containing_file = self.file
        return containing_file[posixpath.dirname(self.name)]

    @property
    @with_phil
    def ref(self):
        """ An (opaque) HDF5 reference to this object """
        return h5r.create(self.id, b'.', h5r.OBJECT)

    @property
    @with_phil
    def regionref(self):
        """Create a region reference (Datasets only).

        The syntax is regionref[<slices>]. For example, dset.regionref[...]
        creates a region reference in which the whole dataset is selected.

        Can also be used to determine the shape of the referenced dataset
        (via the .shape method), or the shape of the selection (via the
        .selection method).
        """
        return _RegionProxy(self)

    @property
    def attrs(self):
        """ Attributes attached to this object """
        # NOTE(review): imported locally, presumably to avoid a circular
        # import with the attrs module - confirm.
        from . import attrs

        with phil:
            return attrs.AttributeManager(self)

    @with_phil
    def __hash__(self):
        return hash(self.id)

    @with_phil
    def __eq__(self, other):
        # Anything without an "id" is left to the other operand.
        if not hasattr(other, 'id'):
            return NotImplemented
        return self.id == other.id

    def __bool__(self):
        with phil:
            return bool(self.id)

    def __getnewargs__(self):
        """Disable pickle.

        Handles for HDF5 objects can't be reliably deserialised, because the
        recipient may not have access to the same files.  Raising here makes
        the attempt fail early.

        If you really want to pickle h5py objects and can live with some
        limitations, look at the h5pickle project on PyPI.
        """
        raise TypeError("h5py objects cannot be pickled")

    def __getstate__(self):
        # Pickle protocols 0 and 1 go through this hook rather than
        # __getnewargs__.
        raise TypeError("h5py objects cannot be pickled")
# --- Dictionary-style interface ----------------------------------------------
# To implement the dictionary-style interface from groups and attributes,
# we inherit from the appropriate abstract base classes in collections.
#
# All locking is taken care of by the subclasses.
# We have to override ValuesView and ItemsView here because Group and
# AttributeManager can only test for key names.
class KeysViewHDF5(KeysView):
    """ Keys view which prints its member names and supports reversed(). """

    def __reversed__(self):
        yield from reversed(self._mapping)

    def __str__(self):
        member_names = list(self)
        return "<KeysViewHDF5 {}>".format(member_names)

    # The repr is the same text as str().
    __repr__ = __str__
class ValuesViewHDF5(ValuesView):

    """
        Value view over e.g. a Group or AttributeManager.

        Note that __contains__ will have poor performance as it has
        to scan all the links or attributes.
    """

    def __contains__(self, value):
        with phil:
            mapping = self._mapping
            return any(value == mapping.get(key) for key in mapping)

    def __iter__(self):
        with phil:
            mapping = self._mapping
            for key in mapping:
                yield mapping.get(key)

    def __reversed__(self):
        with phil:
            mapping = self._mapping
            for key in reversed(mapping):
                yield mapping.get(key)
class ItemsViewHDF5(ItemsView):

    """
        Items view over e.g. a Group or AttributeManager.
    """

    def __contains__(self, item):
        with phil:
            key, val = item
            if key not in self._mapping:
                return False
            return val == self._mapping.get(key)

    def __iter__(self):
        with phil:
            mapping = self._mapping
            for key in mapping:
                yield (key, mapping.get(key))

    def __reversed__(self):
        with phil:
            mapping = self._mapping
            for key in reversed(mapping):
                yield (key, mapping.get(key))
class MappingHDF5(Mapping):

    """
        Immutable mapping interface for a Group, AttributeManager or
        DimensionManager object.

        MutableMapping is not inherited from directly because certain
        subclasses, for example DimensionManager, are read-only.
    """

    def keys(self):
        """ Get a view object on member names """
        return KeysViewHDF5(self)

    def values(self):
        """ Get a view object on member objects """
        return ValuesViewHDF5(self)

    def items(self):
        """ Get a view object on member items """
        return ItemsViewHDF5(self)

    def _ipython_key_completions_(self):
        """ Custom tab completions for __getitem__ in IPython >=5.0. """
        names = list(self.keys())
        names.sort()
        return names
class MutableMappingHDF5(MappingHDF5, MutableMapping):

    """
        Mutable mapping interface for a Group or AttributeManager object,
        as opposed to the read-only mapping of MappingHDF5.

        Adds no members of its own; it only combines the two base classes.
    """
class Empty:

    """
        Proxy object to represent empty/null dataspaces (a.k.a H5S_NULL).

        This can have an associated dtype, but has no shape or data. This is
        not the same as an array with shape (0,).

        Two instances compare equal when their dtypes are equal, and equal
        instances hash alike, so instances can be put in sets or used as
        dict keys.
    """

    # An empty dataspace has neither a shape nor a size.
    shape = None
    size = None

    def __init__(self, dtype):
        """ Store *dtype*, normalized through np.dtype(). """
        self.dtype = np.dtype(dtype)

    def __eq__(self, other):
        if isinstance(other, Empty) and self.dtype == other.dtype:
            return True
        return False

    def __hash__(self):
        # Defining __eq__ alone would leave the class unhashable.  The hash
        # is built from the same information that __eq__ compares.
        return hash((Empty, self.dtype))

    def __repr__(self):
        return "Empty(dtype={0!r})".format(self.dtype)
def product(nums):
    """Multiply together all the numbers in *nums* (1 when there are none)

    For small amounts of data (e.g. shape tuples), this simple code is much
    faster than calling numpy.prod().
    """
    result = 1
    for factor in nums:
        result *= factor
    return result
# Simple variant of cached_property:
# Unlike functools, this has no locking, so we don't have to worry about
# deadlocks with phil (see issue gh-2064). Unlike cached-property on PyPI, it
# doesn't try to import asyncio (which can be ~100 extra modules).
# Many projects seem to have similar variants of this, often without attribution,
# but to be cautious, this code comes from cached-property (Copyright (c) 2015,
# Daniel Greenfeld, BSD license), where it is attributed to bottle (Copyright
# (c) 2009-2022, Marcel Hellkamp, MIT license).
class cached_property:
    """ Descriptor which computes a value once and stores it on the instance. """

    def __init__(self, func):
        self.func = func
        self.__doc__ = getattr(func, "__doc__")

    def __get__(self, obj, cls):
        # Access through the class gives the descriptor itself.
        if obj is None:
            return self

        # Store the result in the instance dict under the function's name.
        result = self.func(obj)
        obj.__dict__[self.func.__name__] = result
        return result
@@ -0,0 +1,46 @@
"""
Compatibility module for high-level h5py
"""
import os
import sys
from ..version import hdf5_built_version_tuple
# HDF5 supported passing paths as UTF-8 for Windows from 1.10.6, but this
# was broken again in 1.14.4 - https://github.com/HDFGroup/hdf5/issues/5037 .
# The change was reverted in 1.14.6.
# "mbcs" for HDF5 builds in the range [1.14.4, 1.14.6), "utf-8" otherwise.
WINDOWS_ENCODING = (
    "mbcs" if (1, 14, 4) <= hdf5_built_version_tuple < (1, 14, 6) else "utf-8"
)
def filename_encode(filename):
    """
    Encode filename for use in the HDF5 library.

    Because HDF5 handles filenames differently on different systems, every
    filename passed to the HDF5 library should go through this function.
    See the documentation on filenames in h5py for more information.
    """
    path = os.fspath(filename)
    windows_text = sys.platform == "win32" and isinstance(path, str)
    if not windows_text:
        return os.fsencode(path)
    return path.encode(WINDOWS_ENCODING, "strict")
def filename_decode(filename):
    """
    Decode filename used by HDF5 library.

    Because HDF5 handles filenames differently on different systems, every
    filename coming from the HDF5 library should go through this function.
    See the documentation on filenames in h5py for more information.

    Raises TypeError for anything which is neither str nor bytes.
    """
    if not isinstance(filename, (str, bytes)):
        raise TypeError(f"expect bytes or str, not {type(filename).__name__}")

    windows_bytes = sys.platform == "win32" and isinstance(filename, bytes)
    if not windows_bytes:
        return os.fsdecode(filename)
    return filename.decode(WINDOWS_ENCODING, "strict")
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,55 @@
# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2013 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
# and contributor agreement.
"""
Implements high-level access to committed datatypes in the file.
"""
import posixpath as pp
from ..h5t import TypeID
from .base import HLObject, with_phil
class Datatype(HLObject):

    """
        An HDF5 named datatype stored in a file.

        A datatype is stored simply by assigning it to a name in a group:

        >>> MyGroup["name"] = numpy.dtype("f")
        >>> named_type = MyGroup["name"]
        >>> assert named_type.dtype == numpy.dtype("f")
    """

    @with_phil
    def __init__(self, bind):
        """ Create a new Datatype object by binding to a low-level TypeID.

        Raises ValueError when *bind* is not a TypeID.
        """
        if isinstance(bind, TypeID):
            super().__init__(bind)
        else:
            raise ValueError("%s is not a TypeID" % bind)

    @property
    @with_phil
    def dtype(self):
        """Numpy dtype equivalent for this datatype"""
        return self.id.dtype

    @with_phil
    def __repr__(self):
        if not self.id:
            return "<Closed HDF5 named type>"

        if self.name is None:
            namestr = '("anonymous")'
        else:
            # Show only the last path component, or '/' when it is empty.
            leaf = pp.basename(pp.normpath(self.name))
            namestr = '"%s"' % (leaf or '/')

        return '<HDF5 named type %s (dtype %s)>' % (namestr, self.dtype.str)
@@ -0,0 +1,181 @@
# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2013 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
# and contributor agreement.
"""
Implements support for HDF5 dimension scales.
"""
import warnings
from .. import h5ds
from ..h5py_warnings import H5pyDeprecationWarning
from . import base
from .base import phil, with_phil
from .dataset import Dataset
class DimensionProxy(base.CommonStateObject):

    """
        Represents an HDF5 "dimension".
    """

    @with_phil
    def __init__(self, id_, dimension):
        # Low-level dataset id plus the number of the dimension on it.
        self._id = id_
        self._dimension = dimension

    @property
    @with_phil
    def label(self):
        """ Get or set the dimension scale label """
        raw_label = h5ds.get_label(self._id, self._dimension)
        return self._d(raw_label)

    @label.setter
    @with_phil
    def label(self, val):
        # pylint: disable=missing-docstring
        h5ds.set_label(self._id, self._dimension, self._e(val))

    @with_phil
    def __hash__(self):
        return hash((type(self), self._id, self._dimension))

    @with_phil
    def __eq__(self, other):
        # Equality is defined purely through the hash values.
        return hash(self) == hash(other)

    @with_phil
    def __iter__(self):
        yield from self.keys()

    @with_phil
    def __len__(self):
        return h5ds.get_num_scales(self._id, self._dimension)

    @with_phil
    def __getitem__(self, item):
        """ Look up a scale by position (int) or by name.

        Raises KeyError when no scale with the given name is found.
        """
        if not isinstance(item, int):
            def match_name(dsid):
                """ Callback handing back dsid when its scale name matches """
                if h5ds.get_scale_name(dsid) == self._e(item):
                    return dsid

            found = h5ds.iterate(self._id, self._dimension, match_name, 0)
            if found is None:
                raise KeyError(item)
            return Dataset(found)

        collected = []
        h5ds.iterate(self._id, self._dimension, collected.append, 0)
        return Dataset(collected[item])

    def attach_scale(self, dset):
        """ Attach a scale to this dimension.

        *dset* is the Dataset of the scale which should be attached.
        """
        with phil:
            h5ds.attach_scale(self._id, dset.id, self._dimension)

    def detach_scale(self, dset):
        """ Remove a scale from this dimension.

        *dset* is the Dataset of the scale which should be removed.
        """
        with phil:
            h5ds.detach_scale(self._id, dset.id, self._dimension)

    def items(self):
        """ Get a list of (name, Dataset) pairs with all scales on this
        dimension.
        """
        with phil:
            scale_ids = []

            # H5DSiterate raises an error if there are no dimension scales,
            # rather than iterating 0 times.  See #483.
            if len(self) > 0:
                h5ds.iterate(self._id, self._dimension, scale_ids.append, 0)

            pairs = []
            for scale_id in scale_ids:
                scale_name = self._d(h5ds.get_scale_name(scale_id))
                pairs.append((scale_name, Dataset(scale_id)))
            return pairs

    def keys(self):
        """ Get a list of names for the scales on this dimension. """
        with phil:
            return [pair[0] for pair in self.items()]

    def values(self):
        """ Get a list of Dataset for scales on this dimension. """
        with phil:
            return [pair[1] for pair in self.items()]

    @with_phil
    def __repr__(self):
        if self._id:
            return ('<"%s" dimension %d of HDF5 dataset at %s>'
                    % (self.label, self._dimension, id(self._id)))
        return "<Dimension of closed HDF5 dataset>"
class DimensionManager(base.CommonStateObject):

    """
        Represents a collection of dimensions associated with a dataset.

        Like AttributeManager, an instance of this class is returned when
        accessing the ".dims" property on a Dataset.
    """

    @with_phil
    def __init__(self, parent):
        """ Private constructor; keeps the low-level id of *parent*.
        """
        self._id = parent.id

    @with_phil
    def __getitem__(self, index):
        """ Return the DimensionProxy for dimension number *index*.

        Negative indices count from the last dimension, as for Python
        sequences.  Raises IndexError when *index* is out of range.
        """
        ndim = len(self)

        # Normalise negative indices, so that a proxy is never built with a
        # negative dimension number.
        if index < 0:
            index += ndim

        if not 0 <= index < ndim:
            raise IndexError('Index out of range')
        return DimensionProxy(self._id, index)

    @with_phil
    def __len__(self):
        """ Number of dimensions associated with the dataset. """
        return self._id.rank

    @with_phil
    def __iter__(self):
        """ Iterate over the dimensions. """
        for i in range(len(self)):
            yield self[i]

    @with_phil
    def __repr__(self):
        if not self._id:
            return "<Dimensions of closed HDF5 dataset>"
        return "<Dimensions of HDF5 object at %s>" % id(self._id)

    def create_scale(self, dset, name=''):
        """ Create a new dimension, from an initial scale.

        Provide the dataset and a name for the scale.

        Deprecated: emits an H5pyDeprecationWarning and then calls
        dset.make_scale(name).
        """
        warnings.warn("other_ds.dims.create_scale(ds, name) is deprecated. "
                      "Use ds.make_scale(name) instead.",
                      H5pyDeprecationWarning, stacklevel=2,
                      )
        dset.make_scale(name)
@@ -0,0 +1,664 @@
# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2013 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
# and contributor agreement.
"""
Implements high-level support for HDF5 file objects.
"""
import inspect
import os
import sys
from warnings import warn
from .compat import filename_decode, filename_encode
from .base import phil, with_phil
from .group import Group
from .. import h5, h5f, h5p, h5i, h5fd, _objects
from .. import version
# Feature flags taken from the h5 configuration object.
mpi = h5.get_config().mpi
ros3 = h5.get_config().ros3
direct_vfd = h5.get_config().direct_vfd

# (major, minor, release) of the HDF5 library version.
hdf5_version = version.hdf5_version_tuple[:3]

swmr_support = True

# libver names and the matching low-level constants, plus the reverse lookup.
libver_dict = {
    'earliest': h5f.LIBVER_EARLIEST,
    'latest': h5f.LIBVER_LATEST,
    'v108': h5f.LIBVER_V18,
    'v110': h5f.LIBVER_V110,
}
libver_dict_r = {const: label for label, const in libver_dict.items()}

# Later bounds are only added when the HDF5 version is new enough.
if hdf5_version >= (1, 11, 4):
    libver_dict['v112'] = h5f.LIBVER_V112
    libver_dict_r[h5f.LIBVER_V112] = 'v112'

if hdf5_version >= (1, 13, 0):
    libver_dict['v114'] = h5f.LIBVER_V114
    libver_dict_r[h5f.LIBVER_V114] = 'v114'

if hdf5_version >= (2, 0, 0):
    libver_dict['v200'] = h5f.LIBVER_V200
    libver_dict_r[h5f.LIBVER_V200] = 'v200'
def _set_fapl_mpio(plist, **kwargs):
    """Configure *plist* (a file access property list) for the mpio driver

    Raises ValueError when h5py was built without MPI support.  An 'info'
    keyword defaults to a new mpi4py.MPI.Info().
    """
    if not mpi:
        raise ValueError("h5py was built without MPI support, can't use mpio driver")

    # Only imported once MPI support is known to be there.
    import mpi4py.MPI

    default_info = mpi4py.MPI.Info()
    kwargs.setdefault('info', default_info)
    plist.set_fapl_mpio(**kwargs)
def _set_fapl_fileobj(plist, **kwargs):
    """Select the Python file object driver in a file access property list

    The file object comes from the 'fileobj' keyword (None when absent).
    """
    fileobj = kwargs.get('fileobj')
    plist.set_fileobj_driver(h5fd.fileobj_driver, fileobj)
# Registry of driver name -> callable(plist, **kwargs) which sets up the
# file access property list for that driver.
_drivers = {
    'sec2': lambda plist, **kwargs: plist.set_fapl_sec2(**kwargs),
    'stdio': lambda plist, **kwargs: plist.set_fapl_stdio(**kwargs),
    'core': lambda plist, **kwargs: plist.set_fapl_core(**kwargs),
    # The family driver gets a copy of the list itself as its member fapl.
    'family': lambda plist, **kwargs: plist.set_fapl_family(
        memb_fapl=plist.copy(), **kwargs
    ),
    'mpio': _set_fapl_mpio,
    'fileobj': _set_fapl_fileobj,
    'split': lambda plist, **kwargs: plist.set_fapl_split(**kwargs),
}

# Optional drivers, registered only when the configuration flag is set.
if ros3:
    _drivers['ros3'] = lambda plist, **kwargs: plist.set_fapl_ros3(**kwargs)

if direct_vfd:
    _drivers['direct'] = lambda plist, **kwargs: plist.set_fapl_direct(**kwargs)  # noqa
def register_driver(name, set_fapl):
    """Add a custom driver to the registry (replacing one of the same name).

    Parameters
    ----------
    name : str
        The name of the driver.
    set_fapl : callable[PropFAID, **kwargs] -> NoneType
        The function which sets up the fapl to use the custom driver.
    """
    _drivers[name] = set_fapl
def unregister_driver(name):
    """Remove a driver from the registry.

    Raises KeyError when no driver is registered under *name*.

    Parameters
    ----------
    name : str
        The name of the driver.
    """
    _drivers.pop(name)
def registered_drivers():
    """Return the names of all currently registered drivers as a frozenset.
    """
    return frozenset(_drivers.keys())
def make_fapl(
    driver, libver=None, rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None,
    locking=None, page_buf_size=None, min_meta_keep=0, min_raw_keep=0,
    alignment_threshold=1, alignment_interval=1, meta_block_size=None,
    **kwds
):
    """Build and return a file access property list.

    ``libver`` is either a single key of ``libver_dict`` (used as the low
    bound, with the high bound set to latest) or an iterable of two such
    keys; None means earliest..latest.  The ``rdcc_*`` values override the
    corresponding chunk-cache settings only when they are not None.
    Remaining keyword arguments are passed to the selected driver.

    Raises ValueError for an unsupported ``locking`` value or an unknown
    driver, and TypeError when keyword arguments are given but no driver is
    selected to consume them.
    """
    plist = h5p.create(h5p.FILE_ACCESS)

    # Library version bounds.
    if libver is None:
        # we default to earliest
        low, high = h5f.LIBVER_EARLIEST, h5f.LIBVER_LATEST
    elif libver in libver_dict:
        low, high = libver_dict[libver], h5f.LIBVER_LATEST
    else:
        low, high = (libver_dict[bound] for bound in libver)
    plist.set_libver_bounds(low, high)
    plist.set_alignment(alignment_threshold, alignment_interval)

    # Chunk cache: start from the current settings and patch the entries
    # the caller asked to override.
    cache_settings = list(plist.get_cache())
    overrides = ((1, rdcc_nslots), (2, rdcc_nbytes), (3, rdcc_w0))
    for position, value in overrides:
        if value is not None:
            cache_settings[position] = value
    plist.set_cache(*cache_settings)

    if page_buf_size:
        plist.set_page_buffer_size(
            int(page_buf_size), int(min_meta_keep), int(min_raw_keep)
        )

    if meta_block_size is not None:
        plist.set_meta_block_size(int(meta_block_size))

    if locking is not None:
        if locking in ("false", False):
            plist.set_file_locking(False, ignore_when_disabled=False)
        elif locking in ("true", True):
            plist.set_file_locking(True, ignore_when_disabled=False)
        elif locking == "best-effort":
            plist.set_file_locking(True, ignore_when_disabled=True)
        else:
            raise ValueError(f"Unsupported locking value: {locking}")

    no_driver = driver is None or (
        driver == 'windows' and sys.platform == 'win32'
    )
    if no_driver:
        # Prevent swallowing unused key arguments
        if kwds:
            bad_key = next(iter(kwds))
            raise TypeError(
                f"'{bad_key}' is an invalid keyword argument for this function"
            )
        return plist

    try:
        set_fapl = _drivers[driver]
    except KeyError as exc:
        raise ValueError(f'Unknown driver type {driver!r}') from exc

    if driver != 'ros3':
        set_fapl(plist, **kwds)
        return plist

    # ros3: the session token is not forwarded to the driver setter; it is
    # applied separately after the driver has been configured.
    token = kwds.pop('session_token', None)
    set_fapl(plist, **kwds)
    if token:
        if hdf5_version < (1, 14, 2):
            raise ValueError('HDF5 >= 1.14.2 required for AWS session token')
        plist.set_fapl_ros3_token(token)
    return plist
def make_fcpl(track_order=False, track_times=False, fs_strategy=None, fs_persist=False,
              fs_threshold=1, fs_page_size=None):
    """Build and return a file creation property list.

    ``track_times`` may be True, False or None (None is treated as False);
    anything else raises TypeError.  ``fs_strategy``, when truthy, must be
    one of 'fsm', 'page', 'aggregate' or 'none', otherwise ValueError is
    raised.  ``fs_page_size`` is only applied with the 'page' strategy.
    """
    plist = h5p.create(h5p.FILE_CREATE)

    if track_order:
        order_flags = h5p.CRT_ORDER_TRACKED | h5p.CRT_ORDER_INDEXED
        plist.set_link_creation_order(order_flags)
        plist.set_attr_creation_order(order_flags)

    if track_times is None:
        track_times = False  # Allow explicit None to mean h5py's default
    if track_times not in (True, False):
        raise TypeError("track_times must be either True or False")
    plist.set_obj_track_times(track_times)

    if not fs_strategy:
        return plist

    strategies = {
        'fsm': h5f.FSPACE_STRATEGY_FSM_AGGR,
        'page': h5f.FSPACE_STRATEGY_PAGE,
        'aggregate': h5f.FSPACE_STRATEGY_AGGR,
        'none': h5f.FSPACE_STRATEGY_NONE,
    }
    if fs_strategy not in strategies:
        raise ValueError("Invalid file space strategy type")
    plist.set_file_space_strategy(
        strategies[fs_strategy], fs_persist, fs_threshold
    )

    if fs_page_size and fs_strategy == 'page':
        plist.set_file_space_page_size(int(fs_page_size))
    return plist
def make_fid(name, mode, userblock_size, fapl, fcpl=None, swmr=False):
    """Get a new FileID by opening or creating a file.

    Also validates the mode argument.

    name
        File name passed on to ``h5f.open`` / ``h5f.create``.
    mode
        One of 'r', 'r+', 'w', 'w-', 'x' or 'a'.
    userblock_size
        Requested user block size, or None.  Not allowed with modes 'r' and
        'r+'.  When given, it is written to the creation property list and
        afterwards checked against the file that was actually opened.
    fapl, fcpl
        File access / file creation property lists.  A creation list is
        made on demand when a user block is requested and ``fcpl`` is None.
    swmr
        Adds the SWMR read flag in mode 'r' when SWMR is supported.

    Raises ValueError for an invalid mode, an invalid user block request,
    or a user block size that differs from that of an existing file.
    """
    if userblock_size is not None:
        if mode in ('r', 'r+'):
            raise ValueError("User block may only be specified "
                             "when creating a file")
        try:
            userblock_size = int(userblock_size)
        except (TypeError, ValueError):
            raise ValueError("User block size must be an integer") from None
        if fcpl is None:
            fcpl = h5p.create(h5p.FILE_CREATE)
        fcpl.set_userblock(userblock_size)

    if mode == 'r':
        flags = h5f.ACC_RDONLY
        if swmr and swmr_support:
            flags |= h5f.ACC_SWMR_READ
        fid = h5f.open(name, flags, fapl=fapl)
    elif mode == 'r+':
        fid = h5f.open(name, h5f.ACC_RDWR, fapl=fapl)
    elif mode in ['w-', 'x']:
        fid = h5f.create(name, h5f.ACC_EXCL, fapl=fapl, fcpl=fcpl)
    elif mode == 'w':
        fid = h5f.create(name, h5f.ACC_TRUNC, fapl=fapl, fcpl=fcpl)
    elif mode == 'a':
        # Open in append mode (read/write).
        # If that fails, create a new file only if it won't clobber an
        # existing one (ACC_EXCL)
        try:
            fid = h5f.open(name, h5f.ACC_RDWR, fapl=fapl)
        # Not all drivers raise FileNotFoundError (commented those that do not).
        # The exception class to catch is chosen from the active driver; it
        # is only evaluated if the open above actually raised.
        except FileNotFoundError if fapl.get_driver() in (
            h5fd.SEC2,
            h5fd.DIRECT if direct_vfd else -1,
            # h5fd.STDIO,
            # h5fd.CORE,
            h5fd.FAMILY,
            h5fd.WINDOWS,
            # h5fd.MPIO,
            # h5fd.MPIPOSIX,
            h5fd.fileobj_driver,
            h5fd.ROS3D if ros3 else -1,
        ) else OSError:
            fid = h5f.create(name, h5f.ACC_EXCL, fapl=fapl, fcpl=fcpl)
    else:
        raise ValueError("Invalid mode; must be one of r, r+, w, w-, x, a")

    if userblock_size is not None:
        # The file may have existed already (mode 'a'); make sure its user
        # block matches the request, and don't leak the handle if not.
        try:
            existing_size = fid.get_create_plist().get_userblock()
            if existing_size != userblock_size:
                raise ValueError(
                    "Requested userblock size (%d) does not match that of "
                    "existing file (%d)" % (userblock_size, existing_size)
                )
        except Exception:
            fid.close()
            # Bare raise re-raises the active exception with its original
            # traceback.
            raise

    return fid
class File(Group):

    """
        Represents an HDF5 file.
    """

    @property
    def attrs(self):
        """ Attributes attached to this object """
        # hdf5 complains that a file identifier is an invalid location for an
        # attribute. Instead of self, pass the root group to AttributeManager:
        from . import attrs
        with phil:
            root = self['/']
            return attrs.AttributeManager(root)

    @property
    @with_phil
    def filename(self):
        """File name on disk"""
        raw_name = h5f.get_name(self.id)
        return filename_decode(raw_name)

    @property
    @with_phil
    def driver(self):
        """Low-level HDF5 file driver used to open file"""
        known = {
            h5fd.SEC2: 'sec2',
            h5fd.STDIO: 'stdio',
            h5fd.CORE: 'core',
            h5fd.FAMILY: 'family',
            h5fd.WINDOWS: 'windows',
            h5fd.MPIO: 'mpio',
            h5fd.MPIPOSIX: 'mpiposix',
            h5fd.fileobj_driver: 'fileobj',
        }
        if ros3:
            known.update({h5fd.ROS3D: 'ros3'})
        if direct_vfd:
            known.update({h5fd.DIRECT: 'direct'})
        active = self.id.get_access_plist().get_driver()
        return known.get(active, 'unknown')

    @property
    @with_phil
    def mode(self):
        """ Python mode used to open file """
        write_intent = h5f.ACC_RDWR
        if swmr_support:
            write_intent |= h5f.ACC_SWMR_WRITE
        if self.id.get_intent() & write_intent:
            return 'r+'
        return 'r'

    @property
    @with_phil
    def libver(self):
        """File format version bounds (2-tuple: low, high)"""
        fapl = self.id.get_access_plist()
        return tuple(libver_dict_r[code] for code in fapl.get_libver_bounds())

    @property
    @with_phil
    def userblock_size(self):
        """ User block size (in bytes) """
        return self.id.get_create_plist().get_userblock()

    @property
    @with_phil
    def meta_block_size(self):
        """ Meta block size (in bytes) """
        return self.id.get_access_plist().get_meta_block_size()

    if mpi:

        @property
        @with_phil
        def atomic(self):
            """ Set/get MPI-IO atomic mode
            """
            return self.id.get_mpi_atomicity()

        @atomic.setter
        @with_phil
        def atomic(self, value):
            # pylint: disable=missing-docstring
            self.id.set_mpi_atomicity(value)

    @property
    @with_phil
    def swmr_mode(self):
        """ Controls single-writer multiple-reader mode """
        if not swmr_support:
            return swmr_support
        swmr_flags = h5f.ACC_SWMR_READ | h5f.ACC_SWMR_WRITE
        return bool(self.id.get_intent() & swmr_flags)

    @swmr_mode.setter
    @with_phil
    def swmr_mode(self, value):
        # pylint: disable=missing-docstring
        if not value:
            raise ValueError("It is not possible to forcibly switch SWMR mode off.")
        self.id.start_swmr_write()

    def __init__(self, name, mode='r', driver=None, libver=None, userblock_size=None, swmr=False,
                 rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, track_order=None,
                 fs_strategy=None, fs_persist=False, fs_threshold=1, fs_page_size=None,
                 page_buf_size=None, min_meta_keep=0, min_raw_keep=0, locking=None,
                 alignment_threshold=1, alignment_interval=1, meta_block_size=None,
                 *, track_times=False, **kwds):
        """Create a new file object.

        See the h5py user guide for a detailed explanation of the options.

        name
            Name of the file on disk, or file-like object.  Note: for files
            created with the 'core' driver, HDF5 still requires this be
            non-empty.
        mode
            r        Readonly, file must exist (default)
            r+       Read/write, file must exist
            w        Create file, truncate if exists
            w- or x  Create file, fail if exists
            a        Read/write if exists, create otherwise
        driver
            Name of the driver to use.  Legal values are None (default,
            recommended), 'core', 'sec2', 'direct', 'stdio', 'mpio', 'ros3'.
        libver
            Library version bounds.  Supported values: 'earliest', 'v108',
            'v110', 'v112', 'v114', 'v200' and 'latest' depending on the
            version of libhdf5 h5py is built against.
        userblock_size
            Desired size of user block.  Only allowed when creating a new
            file (mode w, w- or x).
        swmr
            Open the file in SWMR read mode.  Only used when mode = 'r'.
        rdcc_nslots
            The number of chunk slots in the raw data chunk cache for this
            file.  Increasing this value reduces the number of cache
            collisions, but slightly increases the memory used.  Due to the
            hashing strategy, this value should ideally be a prime number.
            As a rule of thumb, this value should be at least 10 times the
            number of chunks that can fit in rdcc_nbytes bytes.  For maximum
            performance, this value should be set approximately 100 times
            that number of chunks.  The default value is 521.  Applies to
            all datasets unless individually changed.
        rdcc_nbytes
            Total size of the dataset chunk cache in bytes.  The default
            size per dataset is 1024**2 (1 MiB) for HDF5 before 2.0 and
            8 MiB for HDF5 2.0 and later.  Applies to all datasets unless
            individually changed.
        rdcc_w0
            The chunk preemption policy for all datasets.  This must be
            between 0 and 1 inclusive and indicates the weighting according
            to which chunks which have been fully read or written are
            penalized when determining which chunks to flush from cache.  A
            value of 0 means fully read or written chunks are treated no
            differently than other chunks (the preemption is strictly LRU)
            while a value of 1 means fully read or written chunks are always
            preempted before other chunks.  If your application only reads
            or writes data once, this can be safely set to 1.  Otherwise,
            this should be set lower depending on how often you re-read or
            re-write the same data.  The default value is 0.75.  Applies to
            all datasets unless individually changed.
        track_order
            Track dataset/group/attribute creation order under root group
            if True.  If None use global default
            h5.get_config().track_order.
        track_times: bool or None, default: False
            If True, store timestamps for this group in the file.
            If None, fall back to the default value.
        fs_strategy
            The file space handling strategy to be used.  Only allowed when
            creating a new file (mode w, w- or x).  Defined as:
            "fsm"        FSM, Aggregators, VFD
            "page"       Paged FSM, VFD
            "aggregate"  Aggregators, VFD
            "none"       VFD
            If None use HDF5 defaults.
        fs_page_size
            File space page size in bytes.  Only used when
            fs_strategy="page".  If None use the HDF5 default (4096 bytes).
        fs_persist
            A boolean value to indicate whether free space should be
            persistent or not.  Only allowed when creating a new file.  The
            default value is False.
        fs_threshold
            The smallest free-space section size that the free space manager
            will track.  Only allowed when creating a new file.  The default
            value is 1.
        page_buf_size
            Page buffer size in bytes.  Only allowed for HDF5 files created
            with fs_strategy="page".  Must be a power of two value and
            greater or equal than the file space page size when creating the
            file.  It is not used by default.
        min_meta_keep
            Minimum percentage of metadata to keep in the page buffer before
            allowing pages containing metadata to be evicted.  Applicable
            only if page_buf_size is set.  Default value is zero.
        min_raw_keep
            Minimum percentage of raw data to keep in the page buffer before
            allowing pages containing raw data to be evicted.  Applicable
            only if page_buf_size is set.  Default value is zero.
        locking
            The file locking behavior.  Defined as:

            - False (or "false") --  Disable file locking
            - True (or "true")   --  Enable file locking
            - "best-effort"      --  Enable file locking but ignore some errors
            - None               --  Use HDF5 defaults

            .. warning::

                The HDF5_USE_FILE_LOCKING environment variable can override
                this parameter.
        alignment_threshold
            Together with ``alignment_interval``, this property ensures that
            any file object greater than or equal in size to the alignment
            threshold (in bytes) will be aligned on an address which is a
            multiple of alignment interval.
        alignment_interval
            This property should be used in conjunction with
            ``alignment_threshold``.  See the description above.  For more
            details, see
            https://support.hdfgroup.org/documentation/hdf5/latest/group___f_a_p_l.html#gab99d5af749aeb3896fd9e3ceb273677a
        meta_block_size
            Set the current minimum size, in bytes, of new metadata block
            allocations.  See
            https://support.hdfgroup.org/documentation/hdf5/latest/group___f_a_p_l.html#ga8822e3dedc8e1414f20871a87d533cb1
        Additional keywords
            Passed on to the selected file driver.
        """
        if driver == 'ros3':
            if not ros3:
                raise ValueError("h5py was built without ROS3 support, can't use ros3 driver")

            if hdf5_version < (2, 0, 0):
                # For these HDF5 versions an s3:// location is rewritten
                # here into the equivalent https:// endpoint.
                from urllib.parse import urlparse
                parsed = urlparse(name)
                if parsed.scheme == 's3':
                    region = kwds.get('aws_region', b'').decode('ascii')
                    if len(region) == 0:
                        raise ValueError('AWS region required for s3:// location')
                    name = f'https://s3.{region}.amazonaws.com/{parsed.netloc}{parsed.path}'
                elif parsed.scheme not in ('https', 'http'):
                    raise ValueError(f'{name}: S3 location must begin with '
                                     'either "https://", "http://", or "s3://"')

        if isinstance(name, _objects.ObjectID):
            # Binding to the file that owns an already-open object.
            if fs_strategy:
                raise ValueError("Unable to set file space strategy of an existing file")

            with phil:
                fid = h5i.get_file_id(name)
        else:
            is_file_like = all(hasattr(name, attr) for attr in ('read', 'seek'))
            if is_file_like:
                if driver not in (None, 'fileobj'):
                    raise ValueError("Driver must be 'fileobj' for file-like object if specified.")
                driver = 'fileobj'
                if kwds.get('fileobj', name) != name:
                    raise ValueError("Invalid value of 'fileobj' argument; "
                                     "must equal to file-like object if specified.")
                kwds['fileobj'] = name
                # HDF5 still wants a name; derive one from the object's repr.
                name = repr(name).encode('ASCII', 'replace')
            else:
                name = filename_encode(name)

            if track_order is None:
                track_order = h5.get_config().track_order

            if fs_strategy and mode not in ('w', 'w-', 'x'):
                raise ValueError("Unable to set file space strategy of an existing file")

            if swmr and mode != 'r':
                warn(
                    "swmr=True only affects read ('r') mode. For swmr write "
                    "mode, set f.swmr_mode = True after opening the file.",
                    stacklevel=2,
                )

            with phil:
                fapl = make_fapl(
                    driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0,
                    locking, page_buf_size, min_meta_keep, min_raw_keep,
                    alignment_threshold=alignment_threshold,
                    alignment_interval=alignment_interval,
                    meta_block_size=meta_block_size,
                    **kwds
                )
                fcpl = make_fcpl(
                    track_order=track_order, track_times=track_times,
                    fs_strategy=fs_strategy, fs_persist=fs_persist,
                    fs_threshold=fs_threshold, fs_page_size=fs_page_size,
                )
                fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)

            self._libver = libver if isinstance(libver, tuple) else (libver, 'latest')

        super().__init__(fid)

    # Used to give every in-memory file a distinct placeholder name.
    _in_memory_file_counter = 0

    @classmethod
    @with_phil
    def in_memory(cls, file_image=None, **kwargs):
        """Create an HDF5 file in memory, without an underlying file

        file_image
            The initial file contents as bytes (or anything that supports the
            Python buffer interface). HDF5 takes a copy of this data.
        block_size
            Chunk size for new memory allocations (default 64 KiB).

        Other keyword arguments are like File(), although name, mode,
        driver and locking can't be passed.
        """
        rejected = next(
            (key for key in ('driver', 'locking', 'backing_store') if key in kwargs),
            None,
        )
        if rejected is not None:
            raise TypeError(
                f"File.in_memory() got an unexpected keyword argument {rejected!r}"
            )

        # Split off the arguments that belong to the creation property list;
        # everything left over goes to the access property list.
        fcpl_kwargs = {
            key: kwargs.pop(key)
            for key in inspect.signature(make_fcpl).parameters
            if key in kwargs
        }
        fcpl = make_fcpl(**fcpl_kwargs)
        fapl = make_fapl(driver="core", backing_store=False, **kwargs)

        if file_image:
            if fcpl_kwargs:
                kw = ', '.join(fcpl_kwargs)
                raise TypeError(f"{kw} parameters cannot be used with file_image")
            fapl.set_file_image(file_image)

        # We have to give HDF5 a filename, but it should never use it.
        # This is a hint both in memory, and in case a bug ever creates a file.
        # The name also needs to be different from any other open file;
        # we use a simple counter (protected by the 'phil' lock) for this.
        sequence = cls._in_memory_file_counter
        cls._in_memory_file_counter += 1
        name = b"h5py_in_memory_nonfile_%d" % sequence

        if file_image:
            fid = h5f.open(name, h5f.ACC_RDWR, fapl=fapl)
        else:
            fid = h5f.create(name, h5f.ACC_EXCL, fapl=fapl, fcpl=fcpl)
        return cls(fid)

    def close(self):
        """ Close the file.  All open objects become invalid """
        with phil:
            # Nothing to do if the file is no longer open.
            if not self.id.valid:
                return

            # We have to explicitly murder all open objects related to the file

            # Close file-resident objects first, then the files.
            # Otherwise we get errors in MPI mode.
            for object_types in (~h5f.OBJ_FILE, h5f.OBJ_FILE):
                self.id._close_open_objects(h5f.OBJ_LOCAL | object_types)

            self.id.close()
            _objects.nonlocal_close()

    def flush(self):
        """ Tell the HDF5 library to flush its buffers.
        """
        with phil:
            h5f.flush(self.id)

    @with_phil
    def __enter__(self):
        return self

    @with_phil
    def __exit__(self, *args):
        if self.id:
            self.close()

    @with_phil
    def __repr__(self):
        if not self.id:
            return '<Closed HDF5 file>'

        # Filename has to be forced to Unicode if it comes back bytes
        # Mode is always a "native" string
        filename = self.filename
        if isinstance(filename, bytes):  # Can't decode fname
            filename = filename.decode('utf8', 'replace')
        return f'<HDF5 file "{os.path.basename(filename)}" (mode {self.mode})>'
@@ -0,0 +1,412 @@
# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2013 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
# and contributor agreement.
"""
Implements support for HDF5 compression filters via the high-level
interface. The following types of filter are available:
"gzip"
Standard DEFLATE-based compression, at integer levels from 0 to 9.
Built-in to all public versions of HDF5. Use this if you want a
decent-to-good ratio, good portability, and don't mind waiting.
"lzf"
Custom compression filter for h5py. This filter is much, much faster
than gzip (roughly 10x in compression vs. gzip level 4, and 3x faster
in decompressing), but at the cost of a worse compression ratio. Use
this if you want cheap compression and portability is not a concern.
"szip"
Access to the HDF5 SZIP encoder. SZIP is a non-mainstream compression
format used in space science on integer and float datasets. SZIP is
subject to license requirements, which means the encoder is not
guaranteed to be always available. However, it is also much faster
than gzip.
The following constants in this module are also useful:
decode
Tuple of available filter names for decoding
encode
Tuple of available filter names for encoding
"""
from collections.abc import Mapping
import operator
import numpy as np
from .base import product
from .compat import filename_encode
from .. import h5z, h5p, h5d, h5f
# High-level filter name -> HDF5 filter identifier.
_COMP_FILTERS = {
    'gzip': h5z.FILTER_DEFLATE,
    'szip': h5z.FILTER_SZIP,
    'lzf': h5z.FILTER_LZF,
    'shuffle': h5z.FILTER_SHUFFLE,
    'fletcher32': h5z.FILTER_FLETCHER32,
    'scaleoffset': h5z.FILTER_SCALEOFFSET,
}

# Accepted ``fill_time`` names -> HDF5 fill-time constant.
_FILL_TIME_ENUM = {
    'alloc': h5d.FILL_TIME_ALLOC,
    'never': h5d.FILL_TIME_NEVER,
    'ifset': h5d.FILL_TIME_IFSET,
}

# Defaults used when a filter is requested without explicit options.
DEFAULT_GZIP = 4
DEFAULT_SZIP = ('nn', 8)
def _gen_filter_tuples():
    """ Bootstrap function to figure out what filters are available.

    Returns a pair of tuples ``(decode_names, encode_names)`` listing the
    entries of ``_COMP_FILTERS`` that are available and have decoding or
    encoding enabled, respectively.
    """
    decoders, encoders = [], []
    for filter_name, filter_code in _COMP_FILTERS.items():
        if not h5z.filter_avail(filter_code):
            continue
        config = h5z.get_filter_info(filter_code)
        if config & h5z.FILTER_CONFIG_ENCODE_ENABLED:
            encoders.append(filter_name)
        if config & h5z.FILTER_CONFIG_DECODE_ENABLED:
            decoders.append(filter_name)
    return tuple(decoders), tuple(encoders)


decode, encode = _gen_filter_tuples()
def _external_entry(entry):
    """ Check for and return a well-formed entry tuple for
    a call to h5p.set_external.

    Raises TypeError if ``entry`` is not a tuple and ValueError if it does
    not have exactly three elements.
    """
    # Any iterable would do for the unpacking below, but a str or bytes
    # passed by mistake would then fail with a confusing exception.  Only
    # tuples are accepted, which keeps documentation and exception messages
    # simple.
    if not isinstance(entry, tuple):
        raise TypeError(
            "Each external entry must be a tuple of (name, offset, size)")

    raw_name, raw_offset, raw_size = entry  # raise ValueError without three elements
    return (
        filename_encode(raw_name),
        operator.index(raw_offset),
        operator.index(raw_size),
    )
def _normalize_external(external):
    """ Normalize external into a well-formed list of tuples and return.

    None gives an empty list; a single name gives one entry with offset 0
    and unlimited size; otherwise every element is validated as an entry.
    """
    if external is None:
        return []

    # Accept a solitary name---a str, bytes, or os.PathLike acceptable to
    # filename_encode.
    try:
        sole_entry = _external_entry((external, 0, h5f.UNLIMITED))
    except TypeError:
        # Not a lone name; fall through and treat it as a sequence of entries.
        pass
    else:
        return [sole_entry]

    # Check and rebuild each entry to be well-formed.
    return list(map(_external_entry, external))
class FilterRefBase(Mapping):
    """Base class for referring to an HDF5 filter and describing its options

    Your subclass must define filter_id, and may define a filter_options tuple.
    """
    filter_id = None
    filter_options = ()

    # Mapping interface supports using instances as **kwargs for compatibility
    # with older versions of h5py
    @property
    def _kwargs(self):
        return dict(
            compression=self.filter_id,
            compression_opts=self.filter_options,
        )

    def __hash__(self):
        identity = (self.filter_id, self.filter_options)
        return hash(identity)

    def __eq__(self, other):
        if not isinstance(other, FilterRefBase):
            return False
        return (
            self.filter_id == other.filter_id
            and self.filter_options == other.filter_options
        )

    def __len__(self):
        return len(self._kwargs)

    def __iter__(self):
        return iter(self._kwargs)

    def __getitem__(self, item):
        return self._kwargs[item]
class Gzip(FilterRefBase):
    """Filter reference for the DEFLATE (gzip) filter at a given level."""
    filter_id = h5z.FILTER_DEFLATE

    def __init__(self, level=DEFAULT_GZIP):
        # The options are stored as a 1-tuple holding the level.
        options = (level,)
        self.filter_options = options
def fill_dcpl(plist, shape, dtype, chunks, compression, compression_opts,
              shuffle, fletcher32, maxshape, scaleoffset, external,
              allow_unknown_filter=False, *, fill_time=None):
    """ Generate a dataset creation property list.

    Validates the chunking/filter arguments against each other and against
    the dataset ``shape`` and ``dtype``, then applies them to ``plist`` and
    returns it.  For empty (``shape is None``) and scalar (``shape == ()``)
    datasets no chunk/filter option is allowed and a fresh property list is
    returned instead of ``plist``.

    Raises TypeError or ValueError when an argument is malformed or a
    combination is not supported.

    Undocumented and subject to change without warning.
    """

    if shape is None or shape == ():
        shapetype = 'Empty' if shape is None else 'Scalar'
        if any((chunks, compression, compression_opts, shuffle, fletcher32,
                scaleoffset is not None)):
            raise TypeError(
                f"{shapetype} datasets don't support chunk/filter options"
            )
        if maxshape and maxshape != ():
            raise TypeError(f"{shapetype} datasets cannot be extended")
        return h5p.create(h5p.DATASET_CREATE)

    def rq_tuple(tpl, name):
        """ Check if chunks/maxshape match dataset rank """
        if tpl in (None, True):
            return
        try:
            tpl = tuple(tpl)
        except TypeError as exc:
            raise TypeError(f'{name!r} argument must be None or a sequence object') from exc
        if len(tpl) != len(shape):
            raise ValueError(f'{name!r} must have same rank as dataset shape')

    rq_tuple(chunks, 'chunks')
    rq_tuple(maxshape, 'maxshape')

    if compression is not None:
        if isinstance(compression, FilterRefBase):
            compression_opts = compression.filter_options
            compression = compression.filter_id

        if compression not in encode and not isinstance(compression, int):
            # f-string rather than %-formatting: a tuple passed by mistake
            # must produce this ValueError, not a string-formatting TypeError.
            raise ValueError(f'Compression filter "{compression}" is unavailable')

        if compression == 'gzip':
            if compression_opts is None:
                gzip_level = DEFAULT_GZIP
            elif compression_opts in range(10):
                gzip_level = compression_opts
            else:
                # !r instead of "%r" % value so tuple values are reported
                # as given rather than being unpacked by the % operator.
                raise ValueError(
                    f"GZIP setting must be an integer from 0-9, not {compression_opts!r}"
                )

        elif compression == 'lzf':
            if compression_opts is not None:
                raise ValueError("LZF compression filter accepts no options")

        elif compression == 'szip':
            if compression_opts is None:
                compression_opts = DEFAULT_SZIP

            err = "SZIP options must be a 2-tuple ('ec'|'nn', even integer 0-32)"
            try:
                szmethod, szpix = compression_opts
            except TypeError as exc:
                raise TypeError(err) from exc
            except ValueError as exc:
                # Wrong number of elements: report the documented message
                # instead of the bare unpacking error.
                raise ValueError(err) from exc
            if szmethod not in ('ec', 'nn'):
                raise ValueError(err)
            if not (0 < szpix <= 32 and szpix % 2 == 0):
                raise ValueError(err)

    elif compression_opts is not None:
        # Can't specify just compression_opts by itself.
        raise TypeError("Compression method must be specified")

    if scaleoffset is not None:
        # scaleoffset must be an integer when it is not None or False,
        # except for integral data, for which scaleoffset == True is
        # permissible (will use SO_INT_MINBITS_DEFAULT)

        if scaleoffset < 0:
            raise ValueError('scale factor must be >= 0')

        if dtype.kind == 'f':
            if scaleoffset is True:
                raise ValueError('integer scaleoffset must be provided for '
                                 'floating point types')
        elif dtype.kind in ('u', 'i'):
            if scaleoffset is True:
                scaleoffset = h5z.SO_INT_MINBITS_DEFAULT
        else:
            raise TypeError('scale/offset filter only supported for integer '
                            'and floating-point types')

        # Scale/offset following fletcher32 in the filter chain will (almost?)
        # always triggers a read error, as most scale/offset settings are
        # lossy. Since fletcher32 must come first (see comment below) we
        # simply prohibit the combination of fletcher32 and scale/offset.
        if fletcher32:
            raise ValueError('fletcher32 cannot be used with potentially lossy'
                             ' scale/offset filter')

    external = _normalize_external(external)
    # End argument validation

    # Chunking is guessed when explicitly requested, or when it was left
    # unspecified but one of the listed options is in use.
    if (chunks is True) or (chunks is None and any((
            shuffle,
            fletcher32,
            compression,
            (maxshape and not len(external)),
            scaleoffset is not None,
    ))):
        chunks = guess_chunk(shape, maxshape, dtype.itemsize)

    if maxshape is True:
        maxshape = (None,)*len(shape)

    if chunks is not None:
        plist.set_chunk(chunks)

    if fill_time is not None:
        if (ft := _FILL_TIME_ENUM.get(fill_time)) is not None:
            plist.set_fill_time(ft)
        else:
            msg = ("fill_time must be one of the following choices: 'alloc', "
                   f"'never' or 'ifset', but it is {fill_time}.")
            raise ValueError(msg)

    # scale-offset must come before shuffle and compression
    if scaleoffset is not None:
        if dtype.kind in ('u', 'i'):
            plist.set_scaleoffset(h5z.SO_INT, scaleoffset)
        else:  # dtype.kind == 'f'
            plist.set_scaleoffset(h5z.SO_FLOAT_DSCALE, scaleoffset)

    for item in external:
        plist.set_external(*item)

    if shuffle:
        plist.set_shuffle()

    if compression == 'gzip':
        plist.set_deflate(gzip_level)
    elif compression == 'lzf':
        plist.set_filter(h5z.FILTER_LZF, h5z.FLAG_OPTIONAL)
    elif compression == 'szip':
        opts = {'ec': h5z.SZIP_EC_OPTION_MASK, 'nn': h5z.SZIP_NN_OPTION_MASK}
        plist.set_szip(opts[szmethod], szpix)
    elif isinstance(compression, int):
        if not allow_unknown_filter and not h5z.filter_avail(compression):
            raise ValueError("Unknown compression filter number: %s" % compression)
        plist.set_filter(compression, h5z.FLAG_OPTIONAL, compression_opts)

    # `fletcher32` must come after `compression`, otherwise, if `compression`
    # is "szip" and the data is 64bit, the fletcher32 checksum will be wrong
    # (see GitHub issue #953).
    if fletcher32:
        plist.set_fletcher32()

    return plist
def get_filter_name(code):
    """
    Return the name of the compression filter for a given filter identifier.

    Identifiers without a known name are returned as their string form.

    Undocumented and subject to change without warning.
    """
    known_names = {
        h5z.FILTER_DEFLATE: 'gzip',
        h5z.FILTER_SZIP: 'szip',
        h5z.FILTER_SHUFFLE: 'shuffle',
        h5z.FILTER_FLETCHER32: 'fletcher32',
        h5z.FILTER_LZF: 'lzf',
        h5z.FILTER_SCALEOFFSET: 'scaleoffset',
    }
    if code in known_names:
        return known_names[code]
    return str(code)
def get_filters(plist):
    """ Extract a dictionary of active filters from a DCPL, along with
    their settings.

    Keys are the names given by get_filter_name(); values are the gzip
    level, an ``(method, pixels)`` pair for szip, None for lzf and for
    filters without settings, or the raw settings otherwise.

    Undocumented and subject to change without warning.
    """
    pipeline = {}

    for position in range(plist.get_nfilters()):
        code, _, vals, _ = plist.get_filter(position)

        if code == h5z.FILTER_DEFLATE:
            vals = vals[0]  # gzip level
        elif code == h5z.FILTER_SZIP:
            option_mask, pixels = vals[0:2]
            if option_mask & h5z.SZIP_EC_OPTION_MASK:
                method = 'ec'
            elif option_mask & h5z.SZIP_NN_OPTION_MASK:
                method = 'nn'
            else:
                raise TypeError("Unknown SZIP configuration")
            vals = (method, pixels)
        elif code == h5z.FILTER_LZF:
            vals = None
        elif len(vals) == 0:
            vals = None

        pipeline[get_filter_name(code)] = vals

    return pipeline
CHUNK_BASE = 16 * 1024    # Multiplier by which chunks are adjusted
CHUNK_MIN = 8 * 1024      # Soft lower limit (8k)
CHUNK_MAX = 1024 * 1024   # Hard upper limit (1M)


def guess_chunk(shape, maxshape, typesize):
    """ Guess an appropriate chunk layout for a dataset, given its shape and
    the size of each element in bytes.  Will allocate chunks only as large
    as CHUNK_MAX.  Chunks are generally close to some power-of-2 fraction of
    each axis, slightly favoring bigger values for the last index.

    ``maxshape`` is accepted but not used.  Raises ValueError for a scalar
    shape or a shape containing non-finite values.

    Undocumented and subject to change without warning.
    """
    # pylint: disable=unused-argument

    # For unlimited dimensions we have to guess 1024
    shape = tuple(1024 if extent == 0 else extent for extent in shape)

    ndims = len(shape)
    if ndims == 0:
        raise ValueError("Chunks not allowed for scalar datasets.")

    chunks = np.array(shape, dtype='=f8')
    if not np.all(np.isfinite(chunks)):
        raise ValueError("Illegal value in chunk tuple")

    # Determine the optimal chunk size in bytes using a PyTables expression.
    # This is kept as a float.
    dset_size = product(chunks) * typesize
    target_size = CHUNK_BASE * (2 ** np.log10(dset_size / (1024. * 1024)))

    if target_size > CHUNK_MAX:
        target_size = CHUNK_MAX
    elif target_size < CHUNK_MIN:
        target_size = CHUNK_MIN

    step = 0
    while True:
        # Repeatedly loop over the axes, dividing them by 2.  Stop when:
        # 1a. We're smaller than the target chunk size, OR
        # 1b. We're within 50% of the target chunk size, AND
        #  2. The chunk is smaller than the maximum chunk size

        chunk_bytes = product(chunks) * typesize

        near_target = (
            chunk_bytes < target_size
            or abs(chunk_bytes - target_size) / target_size < 0.5
        )
        if near_target and chunk_bytes < CHUNK_MAX:
            break

        if product(chunks) == 1:
            break  # Element size larger than CHUNK_MAX

        axis = step % ndims
        chunks[axis] = np.ceil(chunks[axis] / 2.0)
        step += 1

    return tuple(int(extent) for extent in chunks)
@@ -0,0 +1,811 @@
# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2013 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
# and contributor agreement.
"""
Implements support for high-level access to HDF5 groups.
"""
from contextlib import contextmanager
import posixpath as pp
import numpy
from .compat import filename_decode, filename_encode
from .. import h5, h5g, h5i, h5o, h5r, h5t, h5l, h5p
from . import base
from .base import HLObject, MutableMappingHDF5, phil, with_phil
from . import dataset
from . import datatype
from .vds import vds_support
class Group(HLObject, MutableMappingHDF5):
""" Represents an HDF5 group.
"""
def __init__(self, bind):
    """ Create a new Group object by binding to a low-level GroupID.

    Raises ValueError if ``bind`` is not an ``h5g.GroupID``.
    """
    with phil:
        if not isinstance(bind, h5g.GroupID):
            # f-string rather than "%s" % bind: the % operator would try to
            # unpack a tuple argument and fail with a formatting TypeError.
            raise ValueError(f"{bind} is not a GroupID")
        super().__init__(bind)
def create_group(self, name, track_order=None, *, track_times=False):
    """ Create and return a new subgroup.

    Name may be absolute or relative.  Fails if the target name already
    exists.

    track_order
        Track dataset/group/attribute creation order under this group
        if True. If None use global default h5.get_config().track_order.
    track_times: bool or None, default: False
        If True, store timestamps for this group in the file.
        If None, fall back to the default value.

    Raises TypeError if track_times is not True, False or None.
    """
    if track_order is None:
        track_order = h5.get_config().track_order

    with phil:
        name, lcpl = self._e(name, lcpl=True)
        gcpl = h5p.create(h5p.GROUP_CREATE)

        if track_order:
            order_flags = h5p.CRT_ORDER_TRACKED | h5p.CRT_ORDER_INDEXED
            for apply_order in (gcpl.set_link_creation_order,
                                gcpl.set_attr_creation_order):
                apply_order(order_flags)

        if track_times is None:
            track_times = False  # Allow explicit None to mean h5py's default
        if track_times not in (True, False):
            raise TypeError("track_times must be either True, False, or None")
        gcpl.set_obj_track_times(track_times)

        new_id = h5g.create(self.id, name, lcpl=lcpl, gcpl=gcpl)
        return Group(new_id)
def create_dataset(self, name, shape=None, dtype=None, data=None, **kwds):
    """ Create a new HDF5 dataset and return it.

    name
        Dataset name, absolute or relative to this group. Pass None to
        create an anonymous dataset. If the name contains a "/" after any
        leading slashes, everything before the last "/" is handed to
        require_group() and the dataset is created in that group.

    shape
        Shape of the dataset; use "()" for a scalar dataset. Required
        when "data" is not given.

    dtype
        Numpy dtype or string; dtype('f') is used if omitted. Required
        when "data" is not given; otherwise it overrides the dtype of the
        data array.

    data
        Initial contents of the dataset. When given, shape and dtype may
        be left out.

    Keyword-only arguments:

    chunks
        (Tuple or int) Chunk shape, or True to turn on auto-chunking. An
        integer may be used for a 1D shape.

    maxshape
        (Tuple or int) Allow the dataset to be resized up to this shape.
        Use None for the axes inside the tuple which should be unlimited.
        An integer may be used for a 1D shape. A 1D dataset with
        unlimited maxshape needs a length-1 tuple, ``(None,)``. Passing
        ``None`` sets ``maxshape`` to ``shape``, so the dataset cannot be
        resized; this is the default.

    compression
        (String or int) Compression strategy. Legal values are 'gzip',
        'szip', 'lzf'. An integer in range(10) is taken as the gzip
        compression level; any other integer is the number of a
        dynamically loaded compression filter.

    compression_opts
        Settings for the compression filter: an integer for gzip, a
        2-tuple for szip, etc. For a dynamically loaded compression
        filter number this must be a tuple of values.

    scaleoffset
        (Integer) Turn on the scale/offset filter for (usually) lossy
        compression of integer or floating-point data. With integer data
        the value is the number of bits to retain (0 lets HDF5 work out
        the minimum number of bits needed for lossless compression). With
        floating point data the value is the number of digits after the
        decimal place to retain, so stored values have absolute error
        less than 0.5*10**(-scaleoffset).

    shuffle
        (T/F) Turn on the shuffle filter.

    fletcher32
        (T/F) Turn on fletcher32 error detection. Cannot be combined with
        the scale/offset filter.

    fillvalue
        (Scalar) Value used for uninitialized parts of the dataset.

    track_times
        (T/F) Turn on dataset creation timestamps.

    track_order
        (T/F) Track attribute creation order if True. When omitted, the
        global default h5.get_config().track_order is used.

    external
        (Iterable of tuples) Sets the external storage property, so that
        the dataset is stored in one or more non-HDF5 files outside the
        HDF5 file. Each tuple of (name, offset, size) is added to the
        dataset's list of external files. Each name must be a str, bytes,
        or os.PathLike; each offset and size, an integer. Giving only a
        name instead of an iterable of tuples is equivalent to
        [(name, 0, h5py.h5f.UNLIMITED)].

    efile_prefix
        (String) External dataset file prefix for the dataset access
        property list. Not persisted in the file.

    virtual_prefix
        (String) Virtual dataset file prefix for the dataset access
        property list. Not persisted in the file.

    allow_unknown_filter
        (T/F) Skip the check that the requested filter is available.
        Only meant for use with ``write_direct_chunk``, where the caller
        compresses the data before passing it to h5py.

    rdcc_nbytes
        Total size in bytes of the dataset's chunk cache. The default is
        1024**2 (1 MiB) for HDF5 before 2.0 and 8 MiB for HDF5 2.0 or
        later.

    rdcc_w0
        Chunk preemption policy for this dataset, between 0 and 1
        inclusive. It weights how strongly chunks which have been fully
        read or written are penalized when choosing which chunks to flush
        from the cache. With 0, such chunks are treated like any other
        chunk (strictly LRU preemption); with 1 they are always preempted
        before other chunks. An application which reads or writes data
        only once can safely use 1; otherwise pick a lower value
        depending on how often the same data is re-read or re-written.
        The default value is 0.75.

    rdcc_nslots
        Number of chunk slots in the dataset's chunk cache. A larger
        value gives fewer cache collisions at the cost of slightly more
        memory. Because of the hashing strategy the value should ideally
        be prime. As a rule of thumb, use at least 10 times the number of
        chunks which fit in rdcc_nbytes bytes, and about 100 times that
        number for maximum performance. The default value is 521.
    """
    if 'track_order' not in kwds:
        kwds['track_order'] = h5.get_config().track_order

    # Both file prefixes are encoded the same way before being passed on
    for prefix_key in ('efile_prefix', 'virtual_prefix'):
        if prefix_key in kwds:
            kwds[prefix_key] = self._e(kwds[prefix_key])

    with phil:
        target = self
        if name:
            name = self._e(name)
            # A separator beyond the leading slashes means the dataset
            # belongs in a (possibly new) intermediate group
            if b'/' in name.lstrip(b'/'):
                parent, name = name.rsplit(b'/', 1)
                target = self.require_group(parent)

        new_id = dataset.make_new_dset(target, shape, dtype, data, name, **kwds)
        return dataset.Dataset(new_id)
if vds_support:
    def create_virtual_dataset(self, name, layout, fillvalue=None):
        """Create a new virtual dataset in this group and return it.

        See virtual datasets in the docs for more information.

        name
            (str) Name of the new dataset. If it contains a "/" after any
            leading slashes, the part before the last "/" is handed to
            require_group() and the dataset is created in that group.
        layout
            (VirtualLayout) Defines the sources for the virtual dataset
        fillvalue
            The value to use where there is no data.
        """
        with phil:
            target = self
            if name:
                name = self._e(name)
                if b'/' in name.lstrip(b'/'):
                    parent, name = name.rsplit(b'/', 1)
                    target = self.require_group(parent)

            new_id = layout.make_dataset(target, name=name, fillvalue=fillvalue)
            return dataset.Dataset(new_id)

    @contextmanager
    def build_virtual_dataset(
            self, name, shape, dtype, maxshape=None, fillvalue=None
    ):
        """Assemble a virtual dataset in this group.

        This is used as a context manager::

            with f.build_virtual_dataset('virt', (10, 1000), np.uint32) as layout:
                layout[0] = h5py.VirtualSource('foo.h5', 'data', (1000,))

        The dataset is created from the layout once the ``with`` body has
        finished.

        name
            (str) Name of the new dataset
        shape
            (tuple) Shape of the dataset
        dtype
            A numpy dtype for data read from the virtual dataset
        maxshape
            (tuple, optional) Maximum dimensions if the dataset can grow.
            Use None for unlimited dimensions.
        fillvalue
            The value used where no data is available.
        """
        from .vds import VirtualLayout

        virtual_layout = VirtualLayout(shape, dtype, maxshape, self.file.filename)
        yield virtual_layout
        self.create_virtual_dataset(name, virtual_layout, fillvalue)
def require_dataset(self, name, shape, dtype, exact=False, **kwds):
    """ Open a dataset, creating it first if it doesn't exist.

    With keyword "exact" False (the default), an existing dataset is
    returned only if it has the same shape and a conversion-compatible
    dtype. With "exact" True, shape and dtype must match exactly.

    If keyword "maxshape" is given, the maxshape and dtype must match
    instead.

    Any of the keywords "rdcc_nslots", "rdcc_nbytes", or "rdcc_w0" are
    used to configure the dataset's chunk cache.

    Other dataset keywords (see create_dataset) are accepted, but only
    take effect if a new dataset has to be created.

    Raises TypeError if an incompatible object already exists, or if the
    shape, maxshape or dtype don't match according to the above rules.
    """
    for prefix_key in ('efile_prefix', 'virtual_prefix'):
        if prefix_key in kwds:
            kwds[prefix_key] = self._e(kwds[prefix_key])

    with phil:
        if name not in self:
            return self.create_dataset(name, shape, dtype, **kwds)

        # An integer shape is shorthand for a 1D shape
        if isinstance(shape, int):
            shape = (shape,)

        try:
            existing = dataset.Dataset(dataset.open_dset(self, self._e(name), **kwds))
        except KeyError as exc:
            # Something by that name exists but did not open as a dataset
            dset = self[name]
            raise TypeError(f"Incompatible object ({dset.__class__.__name__}) already exists") from exc

        if shape != existing.shape:
            if "maxshape" not in kwds:
                raise TypeError("Shapes do not match (existing %s vs new %s)" % (existing.shape, shape))
            if kwds["maxshape"] != existing.maxshape:
                raise TypeError("Max shapes do not match (existing %s vs new %s)" % (existing.maxshape, kwds["maxshape"]))

        if exact:
            if dtype != existing.dtype:
                raise TypeError("Datatypes do not exactly match (existing %s vs new %s)" % (existing.dtype, dtype))
        elif not numpy.can_cast(dtype, existing.dtype):
            raise TypeError("Datatypes cannot be safely cast (existing %s vs new %s)" % (existing.dtype, dtype))

        return existing
def create_dataset_like(self, name, other, **kwupdate):
    """ Create a dataset modelled on `other`.

    name
        Name of the dataset (absolute or relative). Provide None to make
        an anonymous dataset.
    other
        The dataset to mimic. Its shape, dtype, chunking, compression and
        related properties are reused, but neither data nor attributes
        are copied.

    Any dataset keywords (see create_dataset) may be provided, including
    shape and dtype; values given explicitly win over those read from
    `other`.
    """
    mirrored = ('shape', 'dtype', 'chunks', 'compression',
                'compression_opts', 'scaleoffset', 'shuffle', 'fletcher32',
                'fillvalue')
    for attr in mirrored:
        kwupdate.setdefault(attr, getattr(other, attr))

    # TODO: more elegant way to pass these (dcpl to create_dataset?)
    create_plist = other.id.get_create_plist()
    kwupdate.setdefault('track_times', create_plist.get_obj_track_times())
    kwupdate.setdefault('track_order', create_plist.get_attr_creation_order() > 0)

    # maxshape always exists on a dataset, but handing it to
    # create_dataset forces a chunked layout, so it is only forwarded
    # when it actually differs from the shape.
    if other.maxshape != other.shape:
        kwupdate.setdefault('maxshape', other.maxshape)

    return self.create_dataset(name, **kwupdate)
def require_group(self, name):
    # TODO: support kwargs like require_dataset
    """Return the group called "name", creating it if it doesn't exist.

    Raises TypeError if an object with that name already exists and is
    not a group.
    """
    with phil:
        if name in self:
            existing = self[name]
            if isinstance(existing, Group):
                return existing
            raise TypeError("Incompatible object (%s) already exists" % existing.__class__.__name__)
        return self.create_group(name)
@with_phil
def __getitem__(self, name):
    """ Open an object in the file.

    "name" may be an HDF5 object reference, or a path given as bytes or
    str. Returns a Group, Dataset or Datatype depending on what was
    opened.

    Raises ValueError for a reference which cannot be dereferenced, and
    TypeError for an unsupported key type or an unknown object type.
    """
    if isinstance(name, h5r.Reference):
        oid = h5r.dereference(name, self.id)
        if oid is None:
            raise ValueError("Invalid HDF5 object reference")
    elif isinstance(name, (bytes, str)):
        oid = h5o.open(self.id, self._e(name), lapl=self._lapl)
    else:
        raise TypeError("Accessing a group is done with bytes or str, "
                        "not {}".format(type(name)))

    # Wrap the low-level identifier in the matching high-level class
    kind = h5i.get_type(oid)
    if kind == h5i.GROUP:
        return Group(oid)
    if kind == h5i.DATASET:
        return dataset.Dataset(oid, readonly=(self.file.mode == 'r'))
    if kind == h5i.DATATYPE:
        return datatype.Datatype(oid)
    raise TypeError("Unknown object type")
def get(self, name, default=None, getclass=False, getlink=False):
    """ Retrieve an item or other information.

    "name" given only:
        Return the item, or "default" if it doesn't exist

    "getclass" is True:
        Return the class of object (Group, Dataset, etc.), or "default"
        if nothing with that name exists

    "getlink" is True:
        Return HardLink, SoftLink or ExternalLink instances. Return
        "default" if nothing with that name exists.

    "getlink" and "getclass" are True:
        Return HardLink, SoftLink and ExternalLink classes. Return
        "default" if nothing with that name exists.

    Raises TypeError for an object or link type which is not recognised.

    Example:

    >>> cls = group.get('foo', getclass=True, getlink=True)
    >>> if cls == SoftLink:
    ...     print('"foo" is a soft link')
    """
    # pylint: disable=arguments-differ
    with phil:
        if not getclass and not getlink:
            try:
                return self[name]
            except KeyError:
                return default

        if name not in self:
            return default

        if not getlink:
            # Only the class of the target object was asked for
            typecode = h5o.get_info(self.id, self._e(name), lapl=self._lapl).type
            try:
                return {h5o.TYPE_GROUP: Group,
                        h5o.TYPE_DATASET: dataset.Dataset,
                        h5o.TYPE_NAMED_DATATYPE: datatype.Datatype}[typecode]
            except KeyError as exc:
                raise TypeError("Unknown object type") from exc

        # Link information: the class if getclass is set, else an instance
        link_type = self.id.links.get_info(self._e(name), lapl=self._lapl).type

        if link_type == h5l.TYPE_SOFT:
            if getclass:
                return SoftLink
            linkbytes = self.id.links.get_val(self._e(name), lapl=self._lapl)
            return SoftLink(self._d(linkbytes))

        if link_type == h5l.TYPE_EXTERNAL:
            if getclass:
                return ExternalLink
            filebytes, linkbytes = self.id.links.get_val(self._e(name), lapl=self._lapl)
            return ExternalLink(
                filename_decode(filebytes), self._d(linkbytes)
            )

        if link_type == h5l.TYPE_HARD:
            if getclass:
                return HardLink
            return HardLink()

        raise TypeError("Unknown link type")
def __setitem__(self, name, obj):
    """ Add an object to the group. The name must not already be in use.

    What happens depends on the type of the assigned object:

    Named HDF5 object (Dataset, Group, Datatype)
        A hard link pointing to the given object is created at "name".

    SoftLink or ExternalLink
        The corresponding link is created.

    Numpy ndarray
        The array is converted to a dataset object, with default
        settings (contiguous storage, etc.).

    Numpy dtype
        A copy of the datatype is committed as a named datatype in the
        file.

    Anything else
        Attempt to convert it to an ndarray and store it. Scalar
        values are stored as scalar datasets. Raise ValueError if we
        can't understand the resulting array dtype.
    """
    with phil:
        name, lcpl = self._e(name, lcpl=True)

        if isinstance(obj, HLObject):
            h5o.link(obj.id, self.id, name, lcpl=lcpl, lapl=self._lapl)
            return

        if isinstance(obj, SoftLink):
            self.id.links.create_soft(name, self._e(obj.path), lcpl=lcpl, lapl=self._lapl)
            return

        if isinstance(obj, ExternalLink):
            encoded_filename = filename_encode(obj.filename)
            self.id.links.create_external(name, encoded_filename, self._e(obj.path),
                                          lcpl=lcpl, lapl=self._lapl)
            return

        if isinstance(obj, numpy.dtype):
            named_type = h5t.py_create(obj, logical=True)
            named_type.commit(self.id, name, lcpl=lcpl)
            return

        # Everything else is stored as data: make an anonymous dataset,
        # then link it into this group under the requested name
        anonymous = self.create_dataset(None, data=obj)
        h5o.link(anonymous.id, self.id, name, lcpl=lcpl)
@with_phil
def __delitem__(self, name):
    """ Unlink the item called "name" from this group. """
    encoded_name = self._e(name)
    self.id.unlink(encoded_name)
@with_phil
def __len__(self):
    """ Return the number of members attached to this group. """
    member_count = self.id.get_num_objs()
    return member_count
@with_phil
def __iter__(self):
    """ Yield the decoded name of every member. """
    yield from map(self._d, self.id.__iter__())
@with_phil
def __reversed__(self):
    """ Yield the decoded name of every member, in reverse order. """
    yield from map(self._d, self.id.__reversed__())
@with_phil
def __contains__(self, name):
    """ Return whether a member called "name" exists. """
    if not hasattr(h5g, "_path_valid"):
        # No path validation helper available: ask the GroupID directly
        return self._e(name) in self.id

    if not self.id:
        return False
    return h5g._path_valid(self.id, self._e(name), self._lapl)
def copy(self, source, dest, name=None,
         shallow=False, expand_soft=False, expand_external=False,
         expand_refs=False, without_attrs=False):
    """Copy an object or group.

    The source may be a path, or a Group, Dataset, or Datatype object.
    The destination may be a path or a Group object. Source and
    destination do not have to be in the same file.

    If the source is a Group object, everything contained in that group
    is copied recursively.

    If the destination is a Group object, the copy is by default created
    in that group under the source's current name (basename of
    obj.name). Set "name" to a string to choose a different name.

    Paths are interpreted relative to this group.

    The following options all default to "False":

     - shallow: copy only immediate members of a group.

     - expand_soft: expand soft links into new objects.

     - expand_external: expand external links into new objects.

     - expand_refs: copy objects that are pointed to by references.

     - without_attrs: copy object without copying attributes.

    Raises TypeError if the destination is an HDF5 object other than a
    Group.

    Example:

    >>> f = File('myfile.hdf5', 'w')
    >>> f.create_group("MyGroup")
    >>> list(f.keys())
    ['MyGroup']
    >>> f.copy('MyGroup', 'MyCopy')
    >>> list(f.keys())
    ['MyGroup', 'MyCopy']

    """
    with phil:
        # Normalise the source to an (object, path relative to it) pair
        if isinstance(source, HLObject):
            src_obj, src_path = source, '.'
        else:
            src_obj, src_path = self, source

        # Normalise the destination in the same way
        if isinstance(dest, Group):
            dst_group = dest
            if name is not None:
                dst_path = name
            else:
                # Reuse the basename of the object being copied
                named = src_obj if src_path == '.' else src_obj[src_path]
                dst_path = pp.basename(h5i.get_name(named.id))
        elif isinstance(dest, HLObject):
            raise TypeError("Destination must be path or Group object")
        else:
            dst_group, dst_path = self, dest

        requested = (
            (shallow, h5o.COPY_SHALLOW_HIERARCHY_FLAG),
            (expand_soft, h5o.COPY_EXPAND_SOFT_LINK_FLAG),
            (expand_external, h5o.COPY_EXPAND_EXT_LINK_FLAG),
            (expand_refs, h5o.COPY_EXPAND_REFERENCE_FLAG),
            (without_attrs, h5o.COPY_WITHOUT_ATTR_FLAG),
        )
        flags = 0
        for enabled, bit in requested:
            if enabled:
                flags |= bit

        # A copy property list is only needed when some option is set
        copypl = None
        if flags:
            copypl = h5p.create(h5p.OBJECT_COPY)
            copypl.set_copy_object(flags)

        h5o.copy(src_obj.id, self._e(src_path), dst_group.id, self._e(dst_path),
                 copypl, base.dlcpl)
def move(self, source, dest):
    """ Move a link to a new location in the file.

    When "source" is a hard link this effectively renames the object.
    When "source" is a soft or external link, the link itself is moved
    and its value is left unmodified. Nothing happens if "source" and
    "dest" are equal.
    """
    with phil:
        if source != dest:
            self.id.links.move(self._e(source), self.id, self._e(dest),
                               lapl=self._lapl, lcpl=self._lcpl)
def visit(self, func):
    """ Recursively visit all names in this group and subgroups.

    Note: visit ignores soft and external links. To visit those, use
    visit_links.

    "func" is any callable (function, method or callable object). It is
    called exactly once for each link in this group and every group
    below it, and must conform to the signature:

        func(<member name>) => <None or return value>

    Iteration continues while the callable returns None; any other
    return value stops the iteration and is immediately returned from
    visit. The iteration order is lexicographic.

    Example:

    >>> # List the entire contents of the file
    >>> f = File("foo.hdf5")
    >>> list_of_names = []
    >>> f.visit(list_of_names.append)
    """
    def call_with_text_name(raw_name):
        """ Hand the callback the decoded name rather than bytes """
        return func(self._d(raw_name))

    with phil:
        return h5o.visit(self.id, call_with_text_name)
def visititems(self, func):
    """ Recursively visit names and objects in this group.

    Note: visititems ignores soft and external links. To visit those, use
    visititems_links.

    "func" is any callable (function, method or callable object). It is
    called exactly once for each link in this group and every group
    below it, and must conform to the signature:

        func(<member name>, <object>) => <None or return value>

    Iteration continues while the callable returns None; any other
    return value stops the iteration and is immediately returned from
    visititems. The iteration order is lexicographic.

    Example:

    # Get a list of all datasets in the file
    >>> mylist = []
    >>> def func(name, obj):
    ...     if isinstance(obj, Dataset):
    ...         mylist.append(name)
    ...
    >>> f = File('foo.hdf5')
    >>> f.visititems(func)
    """
    def call_with_object(raw_name):
        """ Decode the name, open the object and pass both on """
        text_name = self._d(raw_name)
        return func(text_name, self[text_name])

    with phil:
        return h5o.visit(self.id, call_with_object)
def visit_links(self, func):
    """ Recursively visit all names in this group and subgroups.
    Each link will be visited exactly once, regardless of its target.

    "func" is any callable (function, method or callable object). It is
    called exactly once for each link in this group and every group
    below it, and must conform to the signature:

        func(<member name>) => <None or return value>

    Iteration continues while the callable returns None; any other
    return value stops the iteration and is immediately returned from
    visit_links. The iteration order is lexicographic.

    Example:

    >>> # List the entire contents of the file
    >>> f = File("foo.hdf5")
    >>> list_of_names = []
    >>> f.visit_links(list_of_names.append)
    """
    def call_with_text_name(raw_name):
        """ Hand the callback the decoded name rather than bytes """
        return func(self._d(raw_name))

    with phil:
        return self.id.links.visit(call_with_text_name)
def visititems_links(self, func):
    """ Recursively visit links in this group.
    Each link will be visited exactly once, regardless of its target.

    "func" is any callable (function, method or callable object). It is
    called exactly once for each link in this group and every group
    below it, and must conform to the signature:

        func(<member name>, <link>) => <None or return value>

    Iteration continues while the callable returns None; any other
    return value stops the iteration and is immediately returned from
    visititems_links. The iteration order is lexicographic.

    Example:

    # Get a list of all softlinks in the file
    >>> mylist = []
    >>> def func(name, link):
    ...     if isinstance(link, SoftLink):
    ...         mylist.append(name)
    ...
    >>> f = File('foo.hdf5')
    >>> f.visititems_links(func)
    """
    def call_with_link(raw_name):
        """ Decode the name, look up its link object and pass both on """
        text_name = self._d(raw_name)
        return func(text_name, self.get(text_name, getlink=True))

    with phil:
        return self.id.links.visit(call_with_link)
@with_phil
def __repr__(self):
    """ Describe the group by name and member count, or as closed. """
    if not self:
        return "<Closed HDF5 group>"

    label = "(anonymous)" if self.name is None else '"%s"' % self.name
    return '<HDF5 group %s (%d members)>' % (label, len(self))
class HardLink:

    """
        Marker type for a hard link in an HDF5 file. It exists only so
        that Group.get has something sensible to return for hard links;
        it carries no state and has no other function.
    """
class SoftLink:

    """
        A symbolic ("soft") link in an HDF5 file. The path may be
        absolute or relative, and is stored as given (converted to str);
        nothing checks that the target actually exists.
    """

    def __init__(self, path):
        self._path = str(path)

    @property
    def path(self):
        """ Soft link value. Not guaranteed to be a valid path. """
        return self._path

    def __repr__(self):
        return '<SoftLink to "%s">' % self.path
class ExternalLink:

    """
        Represents an HDF5 external link. Paths may be absolute or relative.
        No checking is performed to ensure either the target or file exists.
    """

    def __init__(self, filename, path):
        """ Store the target file name and the path inside that file.

        filename
            Path to the external file. It is passed through
            filename_encode() and then filename_decode() before being
            stored.
        path
            Path of the target object inside the external file; stored
            as given.
        """
        self._filename = filename_decode(filename_encode(filename))
        self._path = path

    @property
    def path(self):
        """ Soft link path, i.e. the part inside the HDF5 file. """
        return self._path

    @property
    def filename(self):
        """ Path to the external HDF5 file in the filesystem. """
        return self._filename

    def __repr__(self):
        # The closing ">" was missing, leaving the repr unbalanced and
        # inconsistent with SoftLink's repr.
        return '<ExternalLink to "%s" in file "%s">' % (self.path,
                                                        self.filename)
@@ -0,0 +1,439 @@
# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2013 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
# and contributor agreement.
"""
High-level access to HDF5 dataspace selections
"""
import numpy as np
from .base import product
from .. import h5s, h5r, _selector
def select(shape, args, dataset=None):
    """ High-level routine which turns arbitrary __getitem__ arguments
    into a selection.

    shape
        Shape of the "source" dataspace.

    args
        Either a single argument or a tuple of arguments. See below for
        supported classes of argument.

    dataset
        A h5py.Dataset instance representing the source dataset.

    Argument classes:

    Single Selection instance
        Returns the argument.

    numpy.ndarray
        Must be a boolean mask.  Returns a PointSelection instance.

    RegionReference
        Returns a Selection instance.

    Indices, slices, ellipses, MultiBlockSlices only
        Returns a SimpleSelection instance

    Indices, slices, ellipses, lists or boolean index arrays
        Returns a FancySelection instance.

    Raises TypeError if a Selection, boolean mask or region reference
    does not match "shape", or if a region reference is given without a
    dataset.
    """
    if not isinstance(args, tuple):
        args = (args,)

    # A lone argument may be one of the "special" indexing objects
    if len(args) == 1:
        only = args[0]

        if isinstance(only, Selection):
            if only.shape != shape:
                raise TypeError("Mismatched selection shape")
            return only

        if isinstance(only, np.ndarray) and only.dtype.kind == 'b':
            if only.shape != shape:
                raise TypeError("Boolean indexing array has incompatible shape")
            return PointSelection.from_mask(only)

        if isinstance(only, h5r.RegionReference):
            if dataset is None:
                raise TypeError("Cannot apply a region reference without a dataset")
            region_space = h5r.get_region(only, dataset.id)
            if shape != region_space.shape:
                raise TypeError("Reference shape does not match dataset shape")
            return Selection(shape, spaceid=region_space)

    # Everything else goes through a selector: the dataset's own one if
    # there is a dataset, otherwise one built on a fresh dataspace
    if dataset is None:
        selector = _selector.Selector(h5s.create_simple(shape))
    else:
        selector = dataset._selector

    return selector.make_selection(args)
class Selection:

    """
        Base class for HDF5 dataspace selections.  Subclasses support the
        "selection protocol", meaning they provide at least these members:

        __init__(shape)   => Create a new selection on "shape"-tuple
        __getitem__(args) => Perform a selection with the range specified.
                             What args are allowed depends on the
                             particular subclass in use.

        id (read-only) =>      h5py.h5s.SpaceID instance
        shape (read-only) =>   The shape of the dataspace.
        mshape  (read-only) => The shape of the selection region.
                               Not guaranteed to fit within "shape", although
                               the total number of points is less than
                               product(shape).
        nselect (read-only) => Number of selected points.  Always equal to
                               product(mshape).

        broadcast(target_shape) => Return an iterable which yields dataspaces
                                   for read, based on target_shape.

        The base class represents "unshaped" selections (1-D).
    """

    def __init__(self, shape, spaceid=None):
        """ Create a selection.  Shape may be None if spaceid is given. """
        if spaceid is None:
            # No dataspace supplied: build one on "shape" and select all
            dims = tuple(shape)
            self._shape = dims
            self._id = h5s.create_simple(dims, (h5s.UNLIMITED,)*len(dims))
            self._id.select_all()
        else:
            self._id = spaceid
            self._shape = spaceid.shape

    @property
    def id(self):
        """ SpaceID instance """
        return self._id

    @property
    def shape(self):
        """ Shape of whole dataspace """
        return self._shape

    @property
    def nselect(self):
        """ Number of elements currently selected """
        return self._id.get_select_npoints()

    @property
    def mshape(self):
        """ Shape of selection (always 1-D for this class) """
        return (self.nselect,)

    @property
    def array_shape(self):
        """Shape of array to read/write (always 1-D for this class)"""
        return self.mshape

    # expand_shape and broadcast only really make sense for SimpleSelection
    def expand_shape(self, source_shape):
        """ Return source_shape unchanged if its size equals nselect.

        Raises TypeError otherwise.
        """
        if product(source_shape) == self.nselect:
            return source_shape
        raise TypeError("Broadcasting is not supported for point-wise selections")

    def broadcast(self, source_shape):
        """ Get an iterable for broadcasting """
        if product(source_shape) != self.nselect:
            raise TypeError("Broadcasting is not supported for point-wise selections")
        yield self._id

    def __getitem__(self, args):
        raise NotImplementedError("This class does not support indexing")
class PointSelection(Selection):

    """
        Represents a point-wise selection.  You can supply sequences of
        points to the three methods append(), prepend() and set(), or
        instantiate it with a single boolean array using from_mask().
    """

    def __init__(self, shape, spaceid=None, points=None):
        """ Create the selection; if "points" is given, select them. """
        super().__init__(shape, spaceid)
        if points is not None:
            self._perform_selection(points, h5s.SELECT_SET)

    def _perform_selection(self, points, op):
        """ Internal method which actually performs the selection

        points
            Sequence of points, converted to a C-ordered 'u8' array. A 1D
            input is treated as a single point.
        op
            Selection operator (h5s.SELECT_SET, SELECT_APPEND or
            SELECT_PREPEND). It is replaced by SELECT_SET when the
            dataspace does not currently hold a point selection.
        """
        points = np.asarray(points, order='C', dtype='u8')
        if points.ndim == 1:
            # Use reshape() rather than assigning to points.shape:
            # asarray() hands back the caller's own array when it is
            # already C-ordered 'u8', and an in-place shape assignment
            # would silently reshape that array for the caller too.
            points = points.reshape((1, points.shape[0]))

        if self._id.get_select_type() != h5s.SEL_POINTS:
            op = h5s.SELECT_SET

        if len(points) == 0:
            self._id.select_none()
        else:
            self._id.select_elements(points, op)

    @classmethod
    def from_mask(cls, mask, spaceid=None):
        """Create a point-wise selection from a NumPy boolean array

        Raises TypeError if "mask" is not a boolean ndarray.
        """
        if not (isinstance(mask, np.ndarray) and mask.dtype.kind == 'b'):
            raise TypeError("PointSelection.from_mask only works with bool arrays")

        # One row of coordinates per True element of the mask
        points = np.transpose(mask.nonzero())
        return cls(mask.shape, spaceid, points=points)

    def append(self, points):
        """ Add the sequence of points to the end of the current selection """
        self._perform_selection(points, h5s.SELECT_APPEND)

    def prepend(self, points):
        """ Add the sequence of points to the beginning of the current selection """
        self._perform_selection(points, h5s.SELECT_PREPEND)

    def set(self, points):
        """ Replace the current selection with the given sequence of points"""
        self._perform_selection(points, h5s.SELECT_SET)
class SimpleSelection(Selection):

    """ A single "rectangular" (regular) selection composed of only slices
        and integer arguments.  Can participate in broadcasting.

        The selection is held in self._sel as a 4-tuple
        (start, count, step, scalar) with one entry per dataspace axis.
    """

    @property
    def mshape(self):
        """ Shape of current selection """
        return self._sel[1]

    @property
    def array_shape(self):
        """ Shape of the array to read/write: mshape without the axes
        flagged as scalar. """
        scalar = self._sel[3]
        return tuple(x for x, s in zip(self.mshape, scalar, strict=True) if not s)

    def __init__(self, shape, spaceid=None, hyperslab=None):
        """ Create the selection; without "hyperslab" everything is
        selected. """
        super().__init__(shape, spaceid)
        if hyperslab is not None:
            self._sel = hyperslab
        else:
            # No hyperslab specified - select all
            rank = len(self.shape)
            self._sel = ((0,)*rank, self.shape, (1,)*rank, (False,)*rank)

    def expand_shape(self, source_shape):
        """Match the dimensions of an array to be broadcast to the selection

        The returned shape describes an array of the same size as the input
        shape, but its dimensions are padded with length-1 axes so that it
        has one entry per axis of the selection.

        E.g. with a dataset shape (10, 5, 4, 2), writing like this::

            ds[..., 0] = np.ones((5, 4))

        The source shape (5, 4) will expand to (1, 5, 4, 1).
        Then the broadcast method below repeats that chunk 10
        times to write to an effective shape of (10, 5, 4, 1).

        Raises TypeError if source_shape cannot be broadcast to the
        selection.
        """
        _, count, _, scalar = self._sel
        rank = len(count)
        remaining_src_dims = list(source_shape)

        # Walk the selection axes from last to first, consuming source
        # dimensions from the right
        eshape = []
        for idx in range(1, rank + 1):
            if len(remaining_src_dims) == 0 or scalar[-idx]:  # Skip scalar axes
                eshape.append(1)
            else:
                t = remaining_src_dims.pop()
                if t == 1 or count[-idx] == t:
                    eshape.append(t)
                else:
                    raise TypeError("Can't broadcast %s -> %s" % (source_shape, self.array_shape))  # array shape

        if any(n > 1 for n in remaining_src_dims):
            # All dimensions from source_shape should either have been popped
            # to match the selection shape, or be 1.
            raise TypeError("Can't broadcast %s -> %s" % (source_shape, self.array_shape))  # array shape

        # We have built eshape backwards, so now reverse it
        return tuple(eshape[::-1])

    def broadcast(self, source_shape):
        """ Return an iterator over target dataspaces for broadcasting.

        Follows the standard NumPy broadcasting rules against the current
        selection shape (self.mshape).

        Raises TypeError if source_shape cannot be broadcast.
        """
        if self.shape == ():
            if product(source_shape) != 1:
                # source_shape is a tuple, so it has to be wrapped in a
                # 1-tuple to be formatted as a single "%s" argument
                raise TypeError("Can't broadcast %s to scalar" % (source_shape,))
            self._id.select_all()
            yield self._id
            return

        start, count, step, _ = self._sel

        rank = len(count)
        tshape = self.expand_shape(source_shape)

        # Avoid ZeroDivisionError below (after the shape checks in expand_shape)
        if any(d == 0 for d in count):
            return

        # Number of times the source block repeats along each axis
        chunks = tuple(x//y for x, y in zip(count, tshape, strict=True))
        nchunks = product(chunks)

        if nchunks == 1:
            yield self._id
        else:
            # Select one source-sized block at the origin, then slide it
            # over the selection by changing the dataspace offset
            sid = self._id.copy()
            sid.select_hyperslab((0,)*rank, tshape, step)
            for idx in range(nchunks):
                offset = tuple(x*y*z + s for x, y, z, s in zip(np.unravel_index(idx, chunks), tshape, step, start, strict=True))
                sid.offset_simple(offset)
                yield sid
class FancySelection(Selection):

    """
        Advanced NumPy-style selection, on top of the standard
        slice-and-int behavior.

        Indexing arguments may be ints, slices, lists of indices, or
        per-axis (1D) boolean arrays.

        Broadcasting is not supported for these selections.
    """

    def __init__(self, shape, spaceid=None, mshape=None, array_shape=None):
        """ Create the selection.

        "mshape" defaults to the dataspace shape, and "array_shape"
        defaults to "mshape".
        """
        super().__init__(shape, spaceid)
        self._mshape = self.shape if mshape is None else mshape
        self._array_shape = self._mshape if array_shape is None else array_shape

    @property
    def mshape(self):
        """ Shape of the selection, as stored at construction """
        return self._mshape

    @property
    def array_shape(self):
        """ Shape of the array to read/write, as stored at construction """
        return self._array_shape

    def expand_shape(self, source_shape):
        """ Return source_shape if it equals array_shape, else raise
        TypeError. """
        if not source_shape == self.array_shape:
            raise TypeError("Broadcasting is not supported for complex selections")
        return source_shape

    def broadcast(self, source_shape):
        """ Yield the dataspace once; source_shape must equal
        array_shape, otherwise TypeError is raised. """
        if not source_shape == self.array_shape:
            raise TypeError("Broadcasting is not supported for complex selections")
        yield self._id
def guess_shape(sid):
    """ Deduce the shape of the selection currently set on a dataspace.

    The result is one of:
        * A tuple with one entry per dataspace dimension
        * A 1D shape, for point-based and multiple-hyperslab selections
        * None, for NULL dataspaces and for scalars with nothing selected
    """
    space_class = sid.get_simple_extent_type()   # Dataspace class
    select_kind = sid.get_select_type()          # Flavor of selection in use

    if space_class == h5s.NULL:
        # Selections are not supported on NULL dataspaces
        return None

    if space_class == h5s.SCALAR:
        # None stands in for an empty 0-rank selection, which NumPy has no
        # way of expressing
        if select_kind == h5s.SEL_NONE:
            return None
        if select_kind == h5s.SEL_ALL:
            return tuple()
        # Any other selection type falls through to the code below
    elif space_class != h5s.SIMPLE:
        raise TypeError("Unrecognized dataspace class %s" % space_class)

    # From here on the dataspace is treated as "simple" (rank >= 1)
    npoints = sid.get_select_npoints()
    ndim = len(sid.shape)

    if select_kind == h5s.SEL_NONE:
        return (0,)*ndim

    if select_kind == h5s.SEL_ALL:
        return sid.shape

    if select_kind == h5s.SEL_POINTS:
        # As in NumPy, a point selection gives a 1D result whatever the
        # rank of the dataspace
        return (npoints,)

    if select_kind != h5s.SEL_HYPERSLABS:
        raise TypeError("Unrecognized selection method %s" % select_kind)

    # Hyperslab-based selection
    if npoints == 0:
        return (0,)*ndim

    lower, upper = (np.array(corner) for corner in sid.get_select_bounds())

    # Extent of the bounding box around the whole selection
    box = upper - lower + np.ones((ndim,))

    def axis_length(axis):
        """ Count the elements selected along one axis.

        Everything except the first point along the axis is removed from a
        copy of the selection. For a 2D selection box of shape (X, Y) and
        axis 1, that leaves a selection of shape (X, 1). The length along
        the axis is the total number of points divided by the number of
        points left over.
        """
        if box[axis] == 1:
            return 1

        mask_start = lower.copy()
        mask_start[axis] += 1
        mask_count = box.copy()
        mask_count[axis] -= 1

        # Drop every point past the first one along this axis
        remainder = sid.copy()
        remainder.select_hyperslab(
            tuple(mask_start), tuple(mask_count), op=h5s.SELECT_NOTB
        )

        return npoints // remainder.get_select_npoints()

    shape = tuple(axis_length(axis) for axis in range(ndim))

    if product(shape) != npoints:
        # Several hyperslab selections are in effect, so use a 1D shape
        return (npoints,)

    return shape
@@ -0,0 +1,103 @@
# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2013 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
# and contributor agreement.
"""
Implements a portion of the selection operations.
"""
import numpy as np
from .. import h5s
def read_dtypes(dataset_dtype, names):
    """ Work out the dtypes needed to read the given fields.

    Returns a 2-tuple containing:

    1. Output dataset dtype
    2. Dtype containing HDF5-appropriate description of destination

    Raises ValueError if names are given for a non-compound dtype, or if
    a requested name is not a field of dataset_dtype.
    """
    if len(names) == 0:
        # Not compound, or all fields wanted: use the dtype as it is
        return dataset_dtype, dataset_dtype

    if dataset_dtype.names is None:
        raise ValueError("Field names only allowed for compound types")

    for wanted in names:
        if wanted not in dataset_dtype.names:
            raise ValueError("Field does not appear in this type.")

    selected = np.dtype(
        [(field, dataset_dtype.fields[field][0]) for field in names]
    )

    if len(names) == 1:
        # A single explicitly selected field loses its field information
        return selected.fields[names[0]][0], selected

    return selected, selected
def read_selections_scalar(dsid, args):
    """ Build the selection for reading from a scalar dataset.

    Returns a 2-tuple containing:

    1. Output dataset shape
    2. HDF5 dataspace containing source selection.

    Raises RuntimeError if dsid is not scalar, and ValueError if args is
    neither () nor (Ellipsis,).
    """
    if dsid.shape != ():
        raise RuntimeError("Illegal selection function for non-scalar dataset")

    wants_array_scalar = args == ()
    if not (wants_array_scalar or args == (Ellipsis,)):
        raise ValueError("Illegal slicing argument for scalar dataspace")

    # None signals that an array scalar should be returned instead of an
    # ndarray with shape ()
    out_shape = None if wants_array_scalar else ()

    source_space = dsid.get_space()
    source_space.select_all()

    return out_shape, source_space
class ScalarReadSelection:

    """
    Slicing support for scalar datasets.
    """

    def __init__(self, fspace, args):
        # Only () and (Ellipsis,) are meaningful for a scalar dataspace
        whole_as_scalar = args == ()
        if not (whole_as_scalar or args == (Ellipsis,)):
            raise ValueError("Illegal slicing argument for scalar dataspace")

        self.mshape = None if whole_as_scalar else ()
        self.mspace = h5s.create(h5s.SCALAR)
        self.fspace = fspace

    def __iter__(self):
        # Exactly one (file space, memory space) pair, with the whole
        # memory space selected
        memory_space = self.mspace
        memory_space.select_all()
        yield self.fspace, memory_space
def select_read(fspace, args):
    """ Top-level dispatch function for reading.

    Only scalar dataspaces are handled at the moment; any other shape
    raises NotImplementedError.
    """
    if fspace.shape != ():
        raise NotImplementedError()

    return ScalarReadSelection(fspace, args)
@@ -0,0 +1,250 @@
# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2013 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
# and contributor agreement.
"""
High-level interface for creating HDF5 virtual datasets
"""
from copy import deepcopy as copy
from collections import namedtuple
import numpy as np
from .compat import filename_encode
from .datatype import Datatype
from .selections import SimpleSelection, select
from .. import h5d, h5p, h5s, h5t
class VDSmap(namedtuple(
        'VDSmap', ['vspace', 'file_name', 'dset_name', 'src_space'])):
    '''A region of a virtual dataset together with the part of a source
    dataset which it maps to.
    '''
# Module-level flag, unconditionally True in this file.
# NOTE(review): presumably kept so that other code can still test for
# virtual dataset support - confirm against callers before removing.
vds_support = True
def _convert_space_for_key(space, key):
    """
    Update the selection on ``space`` in place according to ``key``, so
    that unlimited dimensions can be used in a virtual space selection.

    Every slice in ``key`` whose stop is h5s.UNLIMITED gets an unlimited
    count on the matching axis. Nothing is changed unless the current
    selection is a regular hyperslab.
    """
    if not isinstance(key, tuple):
        key = (key,)

    # h5s.UNLIMITED is only allowed in a regular hyperslab selection, so any
    # other kind of selection is left alone
    if space.get_select_type() != h5s.SEL_HYPERSLABS:
        return
    if not space.is_regular_hyperslab():
        return

    rank = space.get_simple_extent_ndims()
    nargs = len(key)
    start, stride, count, block = space.get_regular_hyperslab()

    # Integer indices are ignored. A slice is checked for an h5s.UNLIMITED
    # stop, and an ellipsis shifts the axis of all later entries by
    # (rank - nargs).
    axis_shift = 0
    for position, item in enumerate(key):
        if item is Ellipsis:
            axis_shift = rank - nargs
        elif isinstance(item, slice) and item.stop == h5s.UNLIMITED:
            updated = list(count)
            updated[position + axis_shift] = h5s.UNLIMITED
            count = tuple(updated)

    space.select_hyperslab(start, count, stride, block)
class VirtualSource:
    """Source definition for virtual data sets.

    An instance stands for an entire source dataset; slice it to say which
    region should be used in the virtual dataset. An instance can only be
    sliced once.

    path_or_dataset
        The path to a file, or an h5py dataset. If a dataset is given,
        no other parameters are allowed, as the relevant values are taken from
        the dataset instead.
    name
        The name of the source dataset within the file.
    shape
        A tuple giving the shape of the dataset.
    dtype
        Numpy dtype or string.
    maxshape
        The source dataset is resizable up to this shape. Use None for
        axes you want to be unlimited.
    """

    def __init__(self, path_or_dataset, name=None,
                 shape=None, dtype=None, maxshape=None):
        from .dataset import Dataset

        if isinstance(path_or_dataset, Dataset):
            # Everything is read from the dataset, so explicit values for
            # the other parameters are rejected
            extras = {'name': name, 'shape': shape,
                      'dtype': dtype, 'maxshape': maxshape}
            failed = {key: value for key, value in extras.items()
                      if value is not None}
            if failed:
                raise TypeError("If a Dataset is passed as the first argument "
                                "then no other arguments may be passed.  You "
                                "passed {failed}".format(failed=failed))
            dataset = path_or_dataset
            path = dataset.file.filename
            name = dataset.name
            shape = dataset.shape
            dtype = dataset.dtype
            maxshape = dataset.maxshape
        else:
            path = path_or_dataset
            if name is None:
                raise TypeError("The name parameter is required when "
                                "specifying a source by path")
            if shape is None:
                raise TypeError("The shape parameter is required when "
                                "specifying a source by path")
            # A bare int is accepted as shorthand for a 1D shape/maxshape
            if isinstance(shape, int):
                shape = (shape,)
            if isinstance(maxshape, int):
                maxshape = (maxshape,)

        self.path = path
        self.name = name
        self.dtype = dtype

        if maxshape is None:
            self.maxshape = shape
        else:
            self.maxshape = tuple(
                h5s.UNLIMITED if extent is None else extent
                for extent in maxshape
            )

        self.sel = SimpleSelection(shape)
        self._all_selected = True

    @property
    def shape(self):
        return self.sel.array_shape

    def __getitem__(self, key):
        if not self._all_selected:
            raise RuntimeError("VirtualSource objects can only be sliced once.")

        # The result is a copy carrying the new selection; self is unchanged
        sliced = copy(self)
        sliced.sel = select(self.shape, key, dataset=None)
        _convert_space_for_key(sliced.sel.id, key)
        sliced._all_selected = False
        return sliced
class VirtualLayout:
    """Object for building a virtual dataset.

    Instantiate this class to define a virtual dataset, assign to slices of it
    (using VirtualSource objects), and then pass it to
    group.create_virtual_dataset() to add the virtual dataset to a file.

    This class does not allow access to the data; the virtual dataset must
    be created in a file before it can be used.

    shape
        A tuple giving the shape of the dataset.
    dtype
        Numpy dtype or string.
    maxshape
        The virtual dataset is resizable up to this shape. Use None for
        axes you want to be unlimited.
    filename
        The name of the destination file, if known in advance. Mappings from
        data in the same file will be stored with filename '.', allowing the
        file to be renamed later.
    """
    def __init__(self, shape, dtype, maxshape=None, filename=None):
        # A bare int is accepted as shorthand for a 1D shape/maxshape
        self.shape = (shape,) if isinstance(shape, int) else shape
        self.dtype = dtype
        self.maxshape = (maxshape,) if isinstance(maxshape, int) else maxshape
        self._filename = filename
        # Source filenames seen by __setitem__; only filled in while the
        # destination filename is unknown
        self._src_filenames = set()
        # Dataset creation property list which collects the mappings
        self.dcpl = h5p.create(h5p.DATASET_CREATE)
        self.dcpl.set_layout(h5d.VIRTUAL)

    def __setitem__(self, key, source):
        """Map the region of this layout selected by key onto source.

        source provides the path, name and selection of the source data
        (see VirtualSource).
        """
        sel = select(self.shape, key, dataset=None)
        _convert_space_for_key(sel.id, key)
        src_filename = self._source_file_name(source.path, self._filename)

        self.dcpl.set_virtual(
            sel.id, src_filename, source.name.encode('utf-8'), source.sel.id
        )
        if self._filename is None:
            # Remembered so that _get_dcpl() can tell later whether this
            # source turns out to be the destination file
            self._src_filenames.add(src_filename)

    @staticmethod
    def _source_file_name(src_filename, dst_filename) -> bytes:
        """Return the filename to store in a mapping for src_filename.

        The result is b'.' when dst_filename is given and names the same
        file as src_filename, and the encoded source filename otherwise.
        """
        src_filename = filename_encode(src_filename)
        if dst_filename and (src_filename == filename_encode(dst_filename)):
            # use relative path if the source dataset is in the same
            # file, in order to keep the virtual dataset valid in case
            # the file is renamed.
            return b'.'
        return filename_encode(src_filename)

    def _get_dcpl(self, dst_filename):
        """Get the property list containing virtual dataset mappings

        If the destination filename wasn't known when the VirtualLayout was
        created, it is handled here.

        The returned object is self.dcpl itself unless the mappings had to
        be rewritten, in which case it is a new property list.

        Raises Exception if a filename was given to the constructor and
        dst_filename does not match it.
        """
        dst_filename = filename_encode(dst_filename)
        if self._filename is not None:
            # filename was known in advance; check dst_filename matches
            if dst_filename != filename_encode(self._filename):
                raise Exception(f"{dst_filename!r} != {self._filename!r}")
            return self.dcpl

        # destination file not known in advance
        if dst_filename in self._src_filenames:
            # At least 1 source file is the same as the destination file,
            # but we didn't know this when making the mapping. Copy the mappings
            # to a new property list, replacing the dest filename with '.'
            new_dcpl = h5p.create(h5p.DATASET_CREATE)
            new_dcpl.set_layout(h5d.VIRTUAL)
            for i in range(self.dcpl.get_virtual_count()):
                src_filename = self.dcpl.get_virtual_filename(i)
                new_dcpl.set_virtual(
                    self.dcpl.get_virtual_vspace(i),
                    self._source_file_name(src_filename, dst_filename),
                    self.dcpl.get_virtual_dsetname(i).encode('utf-8'),
                    self.dcpl.get_virtual_srcspace(i),
                )
            return new_dcpl
        else:
            return self.dcpl  # Mappings are all from other files

    def make_dataset(self, parent, name, fillvalue=None):
        """ Return a new low-level dataset identifier for a virtual dataset

        parent
            Object the dataset is created in; its .id and .file.filename
            are used.
        name
            Name passed on to h5d.create() for the new dataset.
        fillvalue
            Optional fill value, applied to this dataset only.
        """
        dcpl = self._get_dcpl(parent.file.filename)

        if fillvalue is not None:
            # _get_dcpl() may hand back self.dcpl itself. Set the fill value
            # on a copy, so that it does not leak into later make_dataset()
            # calls made with the same layout.
            dcpl = dcpl.copy()
            dcpl.set_fill_value(np.array([fillvalue]))

        maxshape = self.maxshape
        if maxshape is not None:
            maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)

        virt_dspace = h5s.create_simple(self.shape, maxshape)

        if isinstance(self.dtype, Datatype):
            # Named types are used as-is
            tid = self.dtype.id
        else:
            dtype = np.dtype(self.dtype)
            tid = h5t.py_create(dtype, logical=1)

        return h5d.create(parent.id, name=name, tid=tid, space=virt_dspace,
                          dcpl=dcpl)