Merge pull request #1899 from claire-simpson/feature/1774-datastack-metadata

Generate/copy metadata when creating a datastack

commit db4115fa33

HISTORY.rst
@@ -60,9 +60,16 @@
 6. InVEST model Z (model names should be sorted A-Z)
 
 ..
   Unreleased Changes
   ------------------
 
+Unreleased Changes
+------------------
+
+Workbench
+=========
+* Metadata is now generated for files when creating a datastack (with any
+  existing user-added metadata preserved)
+  (`#1774 <https://github.com/natcap/invest/issues/1774>`_).
+
 3.15.1 (2025-05-06)
 -------------------

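The changelog bullet above is the user-facing summary. A minimal sketch of that behavior, using only geometamaker calls that appear in the tests later in this diff; the file path, keywords, and model module name are hypothetical, and the arg keys would have to match the chosen model's MODEL_SPEC:

    import geometamaker
    from natcap.invest import datastack

    # Hypothetical input raster; assumed to exist on disk.
    raster_path = 'landcover.tif'

    # User-added metadata, written before the datastack is built.
    resource = geometamaker.describe(raster_path)
    resource.set_description('my custom description')
    resource.set_keywords(['my-keyword'])
    resource.write()  # writes a metadata sidecar next to the raster

    # Building a datastack now also writes metadata for each file it copies,
    # and the description/keywords set above are preserved rather than
    # overwritten ('natcap.invest.carbon' is an assumed model module name).
    datastack.build_datastack_archive(
        {'raster': raster_path}, 'natcap.invest.carbon', 'archive.invs.tar.gz')
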
@@ -472,7 +472,8 @@ def main(user_args=None):
         try:
             # If there's an exception from creating metadata
             # I don't think we want to indicate a model failure
-            spec_utils.generate_metadata(model_module, parsed_datastack.args)
+            spec_utils.generate_metadata_for_outputs(
+                model_module, parsed_datastack.args)
         except Exception as exc:
             LOGGER.warning(
                 'Something went wrong while generating metadata', exc_info=exc)

@@ -34,6 +34,7 @@ import warnings
 
 from osgeo import gdal
 
+from . import spec_utils
 from . import utils
 from . import validation

@@ -399,8 +400,6 @@ def build_datastack_archive(args, model_name, datastack_path):
             target_filepath = os.path.join(
                 data_dir, f'{key}_file')
             shutil.copyfile(source_path, target_filepath)
-            LOGGER.debug(
-                f'File copied from {source_path} --> {target_filepath}')
             target_arg_value = target_filepath
             files_found[source_path] = target_arg_value
 
@@ -453,9 +452,20 @@ def build_datastack_archive(args, model_name, datastack_path):
     # write parameters to a new json file in the temp workspace
     param_file_uri = os.path.join(temp_workspace,
                                   'parameters' + PARAMETER_SET_EXTENSION)
-    build_parameter_set(
+    parameter_set = build_parameter_set(
         rewritten_args, model_name, param_file_uri, relative=True)
 
+    # write metadata for all files in args
+    keywords = [module.MODEL_SPEC['model_id'], 'InVEST']
+    for k, v in args.items():
+        if isinstance(v, str) and os.path.isfile(v):
+            this_arg_spec = module.MODEL_SPEC['args'][k]
+            # write metadata file to target location (in temp dir)
+            subdir = os.path.dirname(parameter_set['args'][k])
+            target_location = os.path.join(temp_workspace, subdir)
+            spec_utils.write_metadata_file(v, this_arg_spec, keywords,
+                                           out_workspace=target_location)
+
     # Remove the handler before archiving the working dir (and the logfile)
     archive_filehandler.close()
     logging.getLogger().removeHandler(archive_filehandler)

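For orientation (not part of the diff): the new loop places each metadata sidecar next to the copied file inside the temp workspace, reusing the relative paths recorded by build_parameter_set. A sketch of one iteration with hypothetical values:

    import os

    # Hypothetical values mirroring one iteration of the loop above.
    temp_workspace = '/tmp/datastack_tmpdir'  # assumed temp dir
    parameter_set = {'args': {'raster': 'data/raster_raster/landcover.tif'}}

    subdir = os.path.dirname(parameter_set['args']['raster'])
    target_location = os.path.join(temp_workspace, subdir)
    print(target_location)  # /tmp/datastack_tmpdir/data/raster_raster

    # write_metadata_file(..., out_workspace=target_location) then writes the
    # sidecar into that directory, next to the copied file rather than next
    # to the original user input.
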
@@ -534,7 +544,7 @@ def build_parameter_set(args, model_name, paramset_path, relative=False):
         directory of ``paramset_path``.
 
     Returns:
-        ``None``
+        parameter dictionary saved in ``paramset_path``
 
     Raises:
         ValueError if creating a relative path fails.

@@ -584,6 +594,8 @@ def build_parameter_set(args, model_name, paramset_path, relative=False):
             indent=4,
             sort_keys=True))
 
+    return parameter_data
+
 
 def extract_parameter_set(paramset_path):
     """Extract and return attributes from a parameter set.

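A short usage sketch of the new return value (the args dict and file paths are hypothetical; only the 'args' key is what build_datastack_archive relies on above):

    from natcap.invest.datastack import build_parameter_set

    args = {'raster': '/data/landcover.tif'}  # hypothetical model args
    parameter_set = build_parameter_set(
        args, 'natcap.invest.carbon', 'parameters.invs.json', relative=True)

    # Previously the function returned None; now callers can reuse the
    # dictionary that was serialized, e.g. the (relative) paths under 'args'.
    print(parameter_set['args']['raster'])
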
@@ -12,6 +12,8 @@ from natcap.invest import utils
 from . import gettext
 from .unit_registry import u
 
+from pydantic import ValidationError
+
 
 LOGGER = logging.getLogger(__name__)

@@ -613,26 +615,54 @@ def describe_arg_from_name(module_name, *arg_keys):
     return f'.. _{anchor_name}:\n\n{rst_description}'
 
 
-def write_metadata_file(datasource_path, spec, lineage_statement, keywords_list):
-    """Write a metadata sidecar file for an invest output dataset.
+def write_metadata_file(datasource_path, spec, keywords_list,
+                        lineage_statement='', out_workspace=None):
+    """Write a metadata sidecar file for an invest dataset.
+
+    Create metadata for invest model inputs or outputs, taking care to
+    preserve existing human-modified attributes.
+
+    Note: We do not want to overwrite any existing metadata so if there is
+    invalid metadata for the datasource (i.e., doesn't pass geometamaker
+    validation in ``describe``), this function will NOT create new metadata.
 
     Args:
-        datasource_path (str) - filepath to the invest output
-        spec (dict) - the invest specification for ``datasource_path``
-        lineage_statement (str) - string to describe origin of the dataset.
+        datasource_path (str) - filepath to the data to describe
+        spec (dict) - the invest specification for ``datasource_path``
         keywords_list (list) - sequence of strings
+        lineage_statement (str, optional) - string to describe origin of
+            the dataset
+        out_workspace (str, optional) - where to write metadata if different
+            from data location
 
     Returns:
         None
 
     """
-    resource = geometamaker.describe(datasource_path)
+
+    def _get_key(key, resource):
+        """Map name of actual key in yml from model_spec key name."""
+        names = {field.name.lower(): field.name
+                 for field in resource.data_model.fields}
+        return names[key]
 
+    try:
+        resource = geometamaker.describe(datasource_path)
+    except ValidationError:
+        LOGGER.debug(
+            f"Skipping metadata creation for {datasource_path}, as invalid "
+            "metadata exists.")
+        return None
+    # Don't want function to fail bc can't create metadata due to invalid filetype
+    except ValueError as e:
+        LOGGER.debug(f"Skipping metadata creation for {datasource_path}: {e}")
+        return None
+
     resource.set_lineage(lineage_statement)
     # a pre-existing metadata doc could have keywords
     words = resource.get_keywords()
     resource.set_keywords(set(words + keywords_list))
 
-    if 'about' in spec:
+    if 'about' in spec and len(resource.get_description()) < 1:
         resource.set_description(spec['about'])
     attr_spec = None
     if 'columns' in spec:

@@ -641,27 +671,38 @@ def write_metadata_file(datasource_path, spec, lineage_statement, keywords_list):
         attr_spec = spec['fields']
     if attr_spec:
         for key, value in attr_spec.items():
-            about = value['about'] if 'about' in value else ''
-            units = format_unit(value['units']) if 'units' in value else ''
             try:
-                resource.set_field_description(
-                    key, description=about, units=units)
+                # field names in attr_spec are always lowercase, but the
+                # actual fieldname in the data could be any case because
+                # invest does not require case-sensitive fieldnames
+                yaml_key = _get_key(key, resource)
+                # Field description only gets set if its empty, i.e. ''
+                if len(resource.get_field_description(yaml_key)
+                        .description.strip()) < 1:
+                    about = value['about'] if 'about' in value else ''
+                    resource.set_field_description(yaml_key, description=about)
+                # units only get set if empty
+                if len(resource.get_field_description(yaml_key)
+                        .units.strip()) < 1:
+                    units = format_unit(value['units']) if 'units' in value else ''
+                    resource.set_field_description(yaml_key, units=units)
             except KeyError as error:
                 # fields that are in the spec but missing
                 # from model results because they are conditional.
                 LOGGER.debug(error)
     if 'bands' in spec:
         for idx, value in spec['bands'].items():
-            try:
-                units = format_unit(spec['bands'][idx]['units'])
-            except KeyError:
-                units = ''
-            resource.set_band_description(idx, units=units)
+            if len(resource.get_band_description(idx).units) < 1:
+                try:
+                    units = format_unit(spec['bands'][idx]['units'])
+                except KeyError:
+                    units = ''
+                resource.set_band_description(idx, units=units)
 
-    resource.write()
+    resource.write(workspace=out_workspace)
 
 
-def generate_metadata(model_module, args_dict):
+def generate_metadata_for_outputs(model_module, args_dict):
     """Create metadata for all items in an invest model output workspace.
 
     Args:

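A usage sketch of the reworked write_metadata_file signature: keywords_list is now the third positional parameter, with lineage_statement and out_workspace optional. The spec and file path here are hypothetical minimal examples:

    from natcap.invest import spec_utils

    # Minimal hypothetical arg spec; real specs come from a model's MODEL_SPEC.
    arg_spec = {'about': 'Map of land use/land cover codes.'}

    spec_utils.write_metadata_file(
        'landcover.tif',                # hypothetical file to describe
        arg_spec,
        ['carbon', 'InVEST'],           # keywords_list
        lineage_statement='Copied into a datastack archive',
        out_workspace=None)             # None: sidecar lands next to the data
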
@@ -695,7 +736,7 @@ def generate_metadata(model_module, args_dict):
             if os.path.exists(full_path):
                 try:
                     write_metadata_file(
-                        full_path, spec_data, lineage_statement, keywords)
+                        full_path, spec_data, keywords, lineage_statement)
                 except ValueError as error:
                     # Some unsupported file formats, e.g. html
                     LOGGER.debug(error)

@@ -211,6 +211,60 @@ class DatastackArchiveTests(unittest.TestCase):
 
         self.assertEqual(len(archived_params), 1)  # sanity check
 
+    def test_datastack_metadata(self):
+        """Test correct metadata is created for datastack.
+
+        Copy files into a temp directory, create metadata for 1 file.
+        """
+        from natcap.invest import datastack
+        import geometamaker
+
+        params = {
+            'raster': os.path.join(DATA_DIR, "landcover.tif"),
+            'simple_table': os.path.join(DATA_DIR, "carbon_pools_samp.csv"),
+        }
+
+        # Copy params into new dir
+        temp_dir = os.path.join(self.workspace, "temp_dir")
+        os.mkdir(temp_dir)
+
+        for name, f in params.items():
+            shutil.copyfile(f, os.path.join(temp_dir, os.path.basename(f)))
+
+        params = {k: os.path.join(temp_dir, os.path.basename(f))
+                  for k, f in params.items()}
+
+        # generate custom metadata for 1 file before building datastack
+        resource = geometamaker.describe(params['raster'])
+        resource.set_description("foo")
+        resource.set_keywords(["bar"])
+        resource.write()
+
+        archive_path = os.path.join(self.workspace, 'archive.invs.tar.gz')
+
+        datastack.build_datastack_archive(
+            params, 'test_datastack_modules.archive_extraction', archive_path)
+
+        # extract the archive
+        out_directory = os.path.join(self.workspace, 'extracted_archive')
+        datastack._tarfile_safe_extract(archive_path, out_directory)
+
+        # validate metadata in directory to ensure 2 yamls exist
+        files, messages = geometamaker.validate_dir(out_directory,
+                                                    recursive=True)
+        self.assertEqual(len(files), 2)
+        self.assertFalse(any(messages))
+
+        # test that custom description and keyword are not overwritten and new
+        # keywords are added
+        raster_path = os.path.join(out_directory, "data",
+                                   "raster_raster", "landcover.tif")
+        resource = geometamaker.describe(raster_path)
+        self.assertEqual(resource.get_description(), "foo")
+        self.assertCountEqual(resource.get_keywords(),
+                              ["archive_extraction_model", "InVEST", "bar"])
+
     def test_nonspatial_files(self):
         """Datastack: test nonspatial files."""
         from natcap.invest import datastack

@@ -1,4 +1,5 @@
 MODEL_SPEC = {
+    'model_id': 'archive_extraction_model',
     'args': {
         'blank': {'type': 'freestyle_string'},
         'a': {'type': 'integer'},

@@ -6,7 +7,7 @@ MODEL_SPEC = {
         'c': {'type': 'freestyle_string'},
         'foo': {'type': 'file'},
         'bar': {'type': 'file'},
-        'data_dir': {'type': 'directory'},
+        'data_dir': {'type': 'directory', 'contents': {}},
         'raster': {'type': 'raster'},
         'vector': {'type': 'vector'},
         'simple_table': {'type': 'csv'},

@@ -1,4 +1,5 @@
 MODEL_SPEC = {
+    'model_id': 'duplicate_filepaths_model',
     'args': {
         'foo': {'type': 'file'},
         'bar': {'type': 'file'},

@@ -1,6 +1,7 @@
 MODEL_SPEC = {
+    'model_id': 'nonspatial_model',
     'args': {
         'some_file': {'type': 'file'},
-        'data_dir': {'type': 'directory'},
+        'data_dir': {'type': 'directory', 'contents': {}},
     }
 }

@@ -1,4 +1,5 @@
 MODEL_SPEC = {
+    'model_id': 'raster_model',
     'args': {
         'raster': {'type': 'raster'},
     }

@@ -1,9 +1,10 @@
 MODEL_SPEC = {
+    'model_id': 'simple_model',
     'args': {
         'a': {'type': 'integer'},
         'b': {'type': 'freestyle_string'},
         'c': {'type': 'freestyle_string'},
         'd': {'type': 'freestyle_string'},
-        'workspace_dir': {'type': 'directory'},
+        'workspace_dir': {'type': 'directory', 'contents': {}},
     }
 }

@@ -1,4 +1,5 @@
 MODEL_SPEC = {
+    'model_id': 'ui_parameters_model',
     'args': {
         'foo': {'type': 'freestyle_string'},
         'bar': {'type': 'freestyle_string'},

@@ -1,4 +1,5 @@
 MODEL_SPEC = {
+    'model_id': 'vector_model',
     'args': {
         'vector': {'type': 'vector'},
     }

@@ -338,7 +338,7 @@ class TestMetadataFromSpec(unittest.TestCase):
         """Override tearDown function to remove temporary directory."""
         shutil.rmtree(self.workspace_dir)
 
-    def test_write_metadata(self):
+    def test_write_metadata_for_outputs(self):
         """Test writing metadata for an invest output workspace."""
 
         # An example invest output spec

@@ -389,7 +389,7 @@ class TestMetadataFromSpec(unittest.TestCase):
 
         args_dict = {'workspace_dir': self.workspace_dir}
 
-        spec_utils.generate_metadata(model_module, args_dict)
+        spec_utils.generate_metadata_for_outputs(model_module, args_dict)
         files, messages = geometamaker.validate_dir(
             self.workspace_dir, recursive=True)
         self.assertEqual(len(files), 2)