diff --git a/sasdata/data.py b/sasdata/data.py
index fd294a43..788f3d7f 100644
--- a/sasdata/data.py
+++ b/sasdata/data.py
@@ -69,7 +69,7 @@ def abscissae(self) -> Quantity:
                 # TODO: Won't work when there's errors involved. On reflection, we
                 # probably want to avoid creating a new Quantity but at the moment I
                 # can't see a way around it.
-                return Quantity(data_contents, reference_data_content.units)
+                return Quantity(data_contents, reference_data_content.units, name=self._data_contents["Qx"].name, id_header=self._data_contents["Qx"]._id_header)
             case dataset_types.sesans:
                 return self._data_contents["SpinEchoLength"]
             case _:
diff --git a/sasdata/metadata.py b/sasdata/metadata.py
index 5271fb53..d53c3102 100644
--- a/sasdata/metadata.py
+++ b/sasdata/metadata.py
@@ -567,6 +567,14 @@ def from_json(obj):
             raw=MetaNode.from_json(obj["raw"]),
         )
 
+    @property
+    def id_header(self):
+        """Generate a header for use in the unique_id for datasets"""
+        title = ""
+        if self.title is not None:
+            title = self.title
+        return f"{title}:{','.join(self.run)}"
+
     def as_h5(self, f: h5py.Group):
         """Export data onto an HDF5 group"""
         for idx, run in enumerate(self.run):
diff --git a/sasdata/quantities/quantity.py b/sasdata/quantities/quantity.py
index 7cc8f853..9829338b 100644
--- a/sasdata/quantities/quantity.py
+++ b/sasdata/quantities/quantity.py
@@ -1082,14 +1082,13 @@ def summary(self):
 
 
 class Quantity[QuantityType]:
-
-
     def __init__(self,
                  value: QuantityType,
                  units: Unit,
                  standard_error: QuantityType | None = None,
-                 hash_seed = ""):
-
+                 hash_seed="",
+                 name="",
+                 id_header=""):
         self.value = value
         """ Numerical value of this data, in the specified units"""
 
@@ -1113,6 +1112,9 @@ def __init__(self,
 
         self.history = QuantityHistory.variable(self)
 
+        self._id_header = id_header
+        self.name = name
+
     # TODO: Adding this method as a temporary measure but we need a single
     # method that does this.
     def with_standard_error(self, standard_error: "Quantity"):
@@ -1120,7 +1122,10 @@ def with_standard_error(self, standard_error: "Quantity"):
             return Quantity(
                 value=self.value,
                 units=self.units,
-                standard_error=standard_error.in_units_of(self.units),)
+                standard_error=standard_error.in_units_of(self.units),
+                name=self.name,
+                id_header=self._id_header,
+            )
         else:
             raise UnitError(f"Standard error units ({standard_error.units}) "
                             f"are not compatible with value units ({self.units})")
@@ -1133,10 +1138,30 @@ def has_variance(self):
 
     def variance(self) -> "Quantity":
         """ Get the variance of this object"""
         if self._variance is None:
-            return Quantity(np.zeros_like(self.value), self.units**2)
+            return Quantity(np.zeros_like(self.value), self.units**2, name=self.name, id_header=self._id_header)
         else:
-            return Quantity(self._variance, self.units**2)
+            return Quantity(self._variance, self.units**2, name=self.name, id_header=self._id_header)
 
+    def _base62_hash(self) -> str:
+        """Encode the hash_value in base62 for better readability"""
+        hashed = ""
+        current_hash = self.hash_value
+        while current_hash:
+            digit = current_hash % 62
+            if digit < 10:
+                hashed = f"{digit}{hashed}"
+            elif digit < 36:
+                hashed = f"{chr(55 + digit)}{hashed}"
+            else:
+                hashed = f"{chr(61 + digit)}{hashed}"
+            current_hash = (current_hash - digit) // 62
+        return hashed
+
+    @property
+    def unique_id(self) -> str:
+        """Get a human-readable unique id for a data set"""
+        return f"{self._id_header}:{self.name}:{self._base62_hash()}"
+
     def standard_deviation(self) -> "Quantity":
         return self.variance ** 0.5
@@ -1152,7 +1177,8 @@ def to_units_of(self, new_units: Unit) -> "Quantity[QuantityType]":
         return Quantity(value=new_value,
                         units=new_units,
                         standard_error=new_error,
-                        hash_seed=self._hash_seed)
+                        hash_seed=self._hash_seed,
+                        id_header=self._id_header)
 
     def variance_in_units_of(self, units: Unit) -> QuantityType:
         """ Get the variance of quantity in other units """
@@ -1411,10 +1437,9 @@ def __init__(self,
                  name: str,
                  value: QuantityType,
                  units: Unit,
-                 standard_error: QuantityType | None = None):
-
-        super().__init__(value, units, standard_error=standard_error, hash_seed=name)
-        self.name = name
+                 standard_error: QuantityType | None = None,
+                 id_header=""):
+        super().__init__(value, units, standard_error=standard_error, hash_seed=name, name=name, id_header=id_header)
 
     def __repr__(self):
         return f"[{self.name}] " + super().__repr__()
@@ -1432,7 +1457,9 @@ def with_standard_error(self, standard_error: Quantity):
                 value=self.value,
                 units=self.units,
                 standard_error=standard_error.in_units_of(self.units),
-                name=self.name)
+                name=self.name,
+                id_header=self._id_header,
+            )
 
         else:
             raise UnitError(f"Standard error units ({standard_error.units}) "
diff --git a/sasdata/temp_ascii_reader.py b/sasdata/temp_ascii_reader.py
index d8f726f4..96e8634b 100644
--- a/sasdata/temp_ascii_reader.py
+++ b/sasdata/temp_ascii_reader.py
@@ -21,7 +21,7 @@
     guess_starting_position,
 )
 from sasdata.metadata import Metadata, MetaNode
-from sasdata.quantities.quantity import Quantity
+from sasdata.quantities.quantity import NamedQuantity, Quantity
 from sasdata.quantities.units import NamedUnit
 
 
@@ -121,7 +121,7 @@ def split_line(separator_dict: dict[str, bool], line: str) -> list[str]:
 
 
 # TODO: Implement error handling.
-def load_quantities(params: AsciiReaderParams, filename: str) -> dict[str, Quantity]:
+def load_quantities(params: AsciiReaderParams, filename: str, metadata: Metadata) -> dict[str, Quantity]:
     """Load a list of quantities from the filename based on the params."""
     with open(filename) as ascii_file:
         lines = ascii_file.readlines()
@@ -146,7 +146,7 @@ def load_quantities(params: AsciiReaderParams, filename: str) -> dict[str, Quant
             print(f"Line {i + 1} skipped.")
             continue
     file_quantities = {
-        name: Quantity(arrays[i], unit)
+        name: NamedQuantity(name, arrays[i], unit, id_header=metadata.id_header)
         for i, (name, unit) in enumerate(params.columns_included)
     }
     return file_quantities
@@ -194,7 +194,6 @@ def load_data(params: AsciiReaderParams) -> list[SasData]:
     list contained in the params."""
     loaded_data: list[SasData] = []
     for filename in params.filenames:
-        quantities = load_quantities(params, filename)
         raw_metadata = import_metadata(
             params.metadata.all_file_metadata(path.basename(filename))
         )
@@ -207,6 +206,7 @@ def load_data(params: AsciiReaderParams) -> list[SasData]:
             process=None,
             raw=raw_metadata,
         )
+        quantities = load_quantities(params, filename, metadata)
         data = SasData(
             path.basename(filename),
             merge_uncertainties(quantities),
diff --git a/sasdata/temp_hdf5_reader.py b/sasdata/temp_hdf5_reader.py
index af4ee380..e439486a 100644
--- a/sasdata/temp_hdf5_reader.py
+++ b/sasdata/temp_hdf5_reader.py
@@ -72,7 +72,7 @@ def recurse_hdf5(hdf5_entry):
 GET_UNITS_FROM_ELSEWHERE = units.meters
 
 
-def connected_data(node: SASDataGroup, name_prefix="") -> dict[str, Quantity]:
+def connected_data(node: SASDataGroup, name_prefix="", metadata=None) -> dict[str, Quantity]:
     """In the context of NeXus files, load a group of data entries that are
     organised together match up the units and errors with their values"""
     # Gather together data with its error terms
@@ -89,9 +89,7 @@ def connected_data(node: SASDataGroup, name_prefix="", metadata=None) -> dict[s
         else:
             units = GET_UNITS_FROM_ELSEWHERE
 
-        quantity = NamedQuantity(
-            name=name_prefix + child.name, value=child.data, units=units
-        )
+        quantity = NamedQuantity(name=child.name, value=child.data, units=units, id_header=metadata.id_header if metadata else "")
 
         # Turns out people can't be trusted to use the same keys here
         if "uncertainty" in child.attributes or "uncertainties" in child.attributes:
@@ -368,7 +366,7 @@ def load_raw(node: HDF5Group | HDF5Dataset) -> MetaNode:
     else:
         if "units" in attrib and attrib["units"]:
             data = node[()] if node.shape == () else node[:]
-            contents = Quantity(data, parse(attrib["units"]))
+            contents = Quantity(data, parse(attrib["units"]), id_header=node.name)
         else:
             contents = node[()] if node.shape == () else node[:]
     return MetaNode(name=name, attrs=attrib, contents=contents)
@@ -424,13 +422,13 @@ def load_data(filename: str) -> dict[str, SasData]:
             logger.warning("No sasdata or data key")
             logger.warning(f"Known keys: {[k for k in entry_keys]}")
 
+        metadata = parse_metadata(f[root_key])
+
         for key in entry_keys:
             component = entry[key]
             if get_canSAS_class(entry[key])=='SASdata':
                 datum = recurse_hdf5(component)
-                data_contents = connected_data(datum, str(filename))
-
-                metadata = parse_metadata(f[root_key])
+                data_contents = connected_data(datum, str(filename), metadata)
 
                 if "Qz" in data_contents:
                     dataset_type = three_dim
diff --git a/sasdata/temp_xml_reader.py b/sasdata/temp_xml_reader.py
index 5296de01..174fe738 100644
--- a/sasdata/temp_xml_reader.py
+++ b/sasdata/temp_xml_reader.py
@@ -21,7 +21,7 @@
     Source,
     Vec3,
 )
-from sasdata.quantities.quantity import Quantity
+from sasdata.quantities.quantity import NamedQuantity, Quantity
 from sasdata.quantities.units import Unit
 from sasdata.quantities.units import none as unitless
 
@@ -205,7 +205,7 @@ def parse_sample(node: etree._Element, version: str) -> Sample:
     )
 
 
-def parse_data(node: etree._Element, version: str) -> dict[str, Quantity]:
+def parse_data(node: etree._Element, version: str, metadata: Metadata) -> dict[str, Quantity]:
     """Parse scattering data"""
     aos = []
     keys = set()
@@ -244,7 +244,7 @@ def parse_data(node: etree._Element, version: str) -> dict[str, Quantity]:
 
     result: dict[str, Quantity] = {}
     for k in keys:
-        result[k] = Quantity(np.array(soa[k]), us[k])
+        result[k] = NamedQuantity(k, np.array(soa[k]), us[k], id_header=metadata.id_header)
         if k + "dev" in uncertainties:
             result[k] = result[k].with_standard_error(
                 Quantity(np.array(soa[k + "dev"]), us[k + "dev"])
@@ -323,7 +323,7 @@ def load_data(filename: str) -> dict[str, SasData]:
         datacount = 0
         for n in entry.findall(f"{version}:SASdata", ns):
             datacount += 1
-            data_set = parse_data(n, version)
+            data_set = parse_data(n, version, metadata)
             data = data_set
             break
diff --git a/test/sasdataloader/utest_data_names.py b/test/sasdataloader/utest_data_names.py
new file mode 100644
index 00000000..11dcb677
--- /dev/null
+++ b/test/sasdataloader/utest_data_names.py
@@ -0,0 +1,42 @@
+"""
+Tests for generation of unique, but reproducible, names for data quantities
+"""
+
+import os
+
+import pytest
+
+from sasdata.data import SasData
+from sasdata.temp_ascii_reader import load_data_default_params
+from sasdata.temp_hdf5_reader import load_data as hdf_load_data
+from sasdata.temp_xml_reader import load_data as xml_load_data
+
+
+def local_load(path: str) -> list[SasData]:
+    """Load datasets from a local test file, trying each supported extension"""
+    base = os.path.join(os.path.dirname(__file__), path)
+    if os.path.exists(f"{base}.h5"):
+        return list(hdf_load_data(f"{base}.h5").values())
+    if os.path.exists(f"{base}.xml"):
+        return list(xml_load_data(f"{base}.xml").values())
+    if os.path.exists(f"{base}.txt"):
+        return load_data_default_params(f"{base}.txt")
+    raise FileNotFoundError(f"No test data found for {base}")
+
+
+test_file_names = [
+    ("ascii_test_1", "::Q:3KrS58TPgclJ1rgyr0VQp3"),
+    ("ISIS_1_1", "TK49 c10_SANS:79680:Q:4TghWEoJi6xxhyeDXhS751"),
+    ("cansas1d", "Test title:1234:Q:440tNBqdx9jvci6CgjmrmD"),
+    ("MAR07232_rest", "MAR07232_rest_out.dat:2:/sasentry01/sasdata01/Qx:2Y0qTTb054KSJnJaJv0rFl"),
+    ("simpleexamplefile", "::/sasentry01/sasdata01/Q:uoHMeB8mukElC1uLCy7Sd"),
+]
+
+
+@pytest.mark.names
+@pytest.mark.parametrize("filename,expected", test_file_names)
+def test_quantity_name(filename, expected):
+    data = local_load(f"data/{filename}")[0]
+    if data.metadata.title is not None:
+        assert data.abscissae.unique_id.startswith(data.metadata.title)
+    assert data.abscissae.unique_id == expected
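
For reference, the pieces above compose ids of the form <id_header>:<name>:<base62 hash>: Metadata.id_header contributes "<title>:<comma-joined runs>", NamedQuantity carries the column name, and Quantity._base62_hash encodes hash_value. Below is a minimal standalone sketch of the scheme, not part of the patch; base62 is an illustrative re-implementation, and the id_header, name, and hash values are made up for the example, so the output does not correspond to any of the test files above.

def base62(n: int) -> str:
    # Same digit mapping as Quantity._base62_hash: 0-9, then A-Z
    # (chr(55 + d) for d in 10..35), then a-z (chr(61 + d) for d in 36..61).
    alphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    if n == 0:
        return "0"  # note: the patch's while-loop returns "" for a zero hash
    out = ""
    while n:
        n, d = divmod(n, 62)
        out = alphabet[d] + out
    return out

id_header = "Test title:1234"  # what Metadata.id_header yields for title "Test title", run ["1234"]
name = "Q"                     # the NamedQuantity column name
print(f"{id_header}:{name}:{base62(123456789)}")
# prints: Test title:1234:Q:8M0kX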