Merge pull request #1899 from claire-simpson/feature/1774-datastack-metadata

Generate/copy metadata when creating a datastack
Dave Fisher 2025-05-07 05:52:56 -07:00 committed by GitHub
commit db4115fa33
13 changed files with 155 additions and 33 deletions

View File

@@ -60,9 +60,16 @@
6. InVEST model Z (model names should be sorted A-Z)
..
Unreleased Changes
------------------
Unreleased Changes
------------------
Workbench
=========
* Metadata is now generated for files when creating a datastack (with any
existing user-added metadata preserved)
(`#1774 <https://github.com/natcap/invest/issues/1774>`_).
3.15.1 (2025-05-06)
-------------------

View File

@@ -472,7 +472,8 @@ def main(user_args=None):
try:
# An exception raised while creating metadata
# should not be treated as a model failure
spec_utils.generate_metadata(model_module, parsed_datastack.args)
spec_utils.generate_metadata_for_outputs(
model_module, parsed_datastack.args)
except Exception as exc:
LOGGER.warning(
'Something went wrong while generating metadata', exc_info=exc)

View File

@@ -34,6 +34,7 @@ import warnings
from osgeo import gdal
from . import spec_utils
from . import utils
from . import validation
@@ -399,8 +400,6 @@ def build_datastack_archive(args, model_name, datastack_path):
target_filepath = os.path.join(
data_dir, f'{key}_file')
shutil.copyfile(source_path, target_filepath)
LOGGER.debug(
f'File copied from {source_path} --> {target_filepath}')
target_arg_value = target_filepath
files_found[source_path] = target_arg_value
@@ -453,9 +452,20 @@ def build_datastack_archive(args, model_name, datastack_path):
# write parameters to a new json file in the temp workspace
param_file_uri = os.path.join(temp_workspace,
'parameters' + PARAMETER_SET_EXTENSION)
build_parameter_set(
parameter_set = build_parameter_set(
rewritten_args, model_name, param_file_uri, relative=True)
# write metadata for all files in args
keywords = [module.MODEL_SPEC['model_id'], 'InVEST']
for k, v in args.items():
if isinstance(v, str) and os.path.isfile(v):
this_arg_spec = module.MODEL_SPEC['args'][k]
# write metadata file to target location (in temp dir)
subdir = os.path.dirname(parameter_set['args'][k])
target_location = os.path.join(temp_workspace, subdir)
spec_utils.write_metadata_file(v, this_arg_spec, keywords,
out_workspace=target_location)
# Remove the handler before archiving the working dir (and the logfile)
archive_filehandler.close()
logging.getLogger().removeHandler(archive_filehandler)
@@ -534,7 +544,7 @@ def build_parameter_set(args, model_name, paramset_path, relative=False):
directory of ``paramset_path``.
Returns:
``None``
the parameter dictionary written to ``paramset_path``
Raises:
ValueError if creating a relative path fails.
@@ -584,6 +594,8 @@ def build_parameter_set(args, model_name, paramset_path, relative=False):
indent=4,
sort_keys=True))
return parameter_data
def extract_parameter_set(paramset_path):
"""Extract and return attributes from a parameter set.

View File

@@ -12,6 +12,8 @@ from natcap.invest import utils
from . import gettext
from .unit_registry import u
from pydantic import ValidationError
LOGGER = logging.getLogger(__name__)
@@ -613,26 +615,54 @@ def describe_arg_from_name(module_name, *arg_keys):
return f'.. _{anchor_name}:\n\n{rst_description}'
def write_metadata_file(datasource_path, spec, lineage_statement, keywords_list):
"""Write a metadata sidecar file for an invest output dataset.
def write_metadata_file(datasource_path, spec, keywords_list,
lineage_statement='', out_workspace=None):
"""Write a metadata sidecar file for an invest dataset.
Create metadata for invest model inputs or outputs, taking care to
preserve existing human-modified attributes.
Note: We do not want to overwrite any existing metadata, so if the
datasource has invalid metadata (i.e., metadata that does not pass
geometamaker validation in ``describe``), this function will NOT
create new metadata.
Args:
datasource_path (str) - filepath to the invest output
spec (dict) - the invest specification for ``datasource_path``
lineage_statement (str) - string to describe origin of the dataset.
datasource_path (str) - filepath to the data to describe
spec (dict) - the invest specification for ``datasource_path``
keywords_list (list) - sequence of strings
lineage_statement (str, optional) - string to describe origin of
the dataset
out_workspace (str, optional) - where to write metadata if different
from data location
Returns:
None
"""
resource = geometamaker.describe(datasource_path)
def _get_key(key, resource):
"""Map name of actual key in yml from model_spec key name."""
names = {field.name.lower(): field.name
for field in resource.data_model.fields}
return names[key]
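# Illustration (hypothetical fieldname): if the data's actual
# fieldname is 'LULC_Code' but the model_spec key is 'lulc_code',
# _get_key('lulc_code', resource) returns 'LULC_Code', because
# invest does not require case-sensitive fieldnames.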
try:
resource = geometamaker.describe(datasource_path)
except ValidationError:
LOGGER.debug(
f"Skipping metadata creation for {datasource_path}, as invalid "
"metadata exists.")
return None
# Don't let the function fail when metadata can't be created due to an invalid filetype
except ValueError as e:
LOGGER.debug(f"Skipping metadata creation for {datasource_path}: {e}")
return None
resource.set_lineage(lineage_statement)
# a pre-existing metadata doc could have keywords
words = resource.get_keywords()
resource.set_keywords(set(words + keywords_list))
if 'about' in spec:
if 'about' in spec and len(resource.get_description()) < 1:
resource.set_description(spec['about'])
attr_spec = None
if 'columns' in spec:
@@ -641,27 +671,38 @@ def write_metadata_file(datasource_path, spec, lineage_statement, keywords_list)
attr_spec = spec['fields']
if attr_spec:
for key, value in attr_spec.items():
about = value['about'] if 'about' in value else ''
units = format_unit(value['units']) if 'units' in value else ''
try:
resource.set_field_description(
key, description=about, units=units)
# field names in attr_spec are always lowercase, but the
# actual fieldname in the data could be any case because
# invest does not require case-sensitive fieldnames
yaml_key = _get_key(key, resource)
# Field description only gets set if it's empty, i.e. ''
if len(resource.get_field_description(yaml_key)
.description.strip()) < 1:
about = value['about'] if 'about' in value else ''
resource.set_field_description(yaml_key, description=about)
# units only get set if empty
if len(resource.get_field_description(yaml_key)
.units.strip()) < 1:
units = format_unit(value['units']) if 'units' in value else ''
resource.set_field_description(yaml_key, units=units)
except KeyError as error:
# Fields that are in the spec may be missing from
# model results because they are conditional.
LOGGER.debug(error)
if 'bands' in spec:
for idx, value in spec['bands'].items():
try:
units = format_unit(spec['bands'][idx]['units'])
except KeyError:
units = ''
resource.set_band_description(idx, units=units)
if len(resource.get_band_description(idx).units) < 1:
try:
units = format_unit(spec['bands'][idx]['units'])
except KeyError:
units = ''
resource.set_band_description(idx, units=units)
resource.write()
resource.write(workspace=out_workspace)
def generate_metadata(model_module, args_dict):
def generate_metadata_for_outputs(model_module, args_dict):
"""Create metadata for all items in an invest model output workspace.
Args:
@@ -695,7 +736,7 @@ def generate_metadata(model_module, args_dict):
if os.path.exists(full_path):
try:
write_metadata_file(
full_path, spec_data, lineage_statement, keywords)
full_path, spec_data, keywords, lineage_statement)
except ValueError as error:
# Some unsupported file formats, e.g. html
LOGGER.debug(error)
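For reference, a minimal sketch of calling the reworked function directly; the path, spec, and keyword values below are hypothetical, not taken from a real model:

from natcap.invest import spec_utils

arg_spec = {
    'type': 'csv',
    'about': 'A table mapping LULC codes to carbon pool values.',
    'columns': {'lucode': {'type': 'integer', 'about': 'LULC code'}},
}
# Writes the sidecar into archive_dir instead of next to the csv;
# any pre-existing human-edited description, keywords, or field
# descriptions are preserved, and only empty attributes are filled.
spec_utils.write_metadata_file(
    'carbon_pools.csv', arg_spec, keywords_list=['InVEST', 'carbon'],
    out_workspace='archive_dir')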

View File

@@ -211,6 +211,60 @@ class DatastackArchiveTests(unittest.TestCase):
self.assertEqual(len(archived_params), 1) # sanity check
def test_datastack_metadata(self):
"""Test correct metadata is created for datastack
Copy files into a temp directory, create metadata for 1 file
"""
from natcap.invest import datastack
import geometamaker
params = {
'raster': os.path.join(DATA_DIR, "landcover.tif"),
'simple_table': os.path.join(DATA_DIR, "carbon_pools_samp.csv"),
}
# Copy params into new dir
temp_dir = os.path.join(self.workspace, "temp_dir")
os.mkdir(temp_dir)
for name, f in params.items():
shutil.copyfile(f, os.path.join(temp_dir, os.path.basename(f)))
params = {k: os.path.join(temp_dir, os.path.basename(f))
for k, f in params.items()}
# generate custom metadata for 1 file before building datastack
resource = geometamaker.describe(params['raster'])
resource.set_description("foo")
resource.set_keywords(["bar"])
resource.write()
archive_path = os.path.join(self.workspace, 'archive.invs.tar.gz')
datastack.build_datastack_archive(
params, 'test_datastack_modules.archive_extraction', archive_path)
# extract the archive
out_directory = os.path.join(self.workspace, 'extracted_archive')
datastack._tarfile_safe_extract(archive_path, out_directory)
# validate metadata in directory to ensure 2 yamls exist
files, messages = geometamaker.validate_dir(out_directory,
recursive=True)
self.assertEqual(len(files), 2)
self.assertFalse(any(messages))
# test that custom description and keyword are not overwritten and new
# keywords are added
raster_path = os.path.join(out_directory, "data",
"raster_raster", "landcover.tif")
resource = geometamaker.describe(raster_path)
self.assertEqual(resource.get_description(), "foo")
self.assertCountEqual(resource.get_keywords(),
["archive_extraction_model", "InVEST", "bar"])
def test_nonspatial_files(self):
"""Datastack: test nonspatial files."""
from natcap.invest import datastack

View File

@@ -1,4 +1,5 @@
MODEL_SPEC = {
'model_id': 'archive_extraction_model',
'args': {
'blank': {'type': 'freestyle_string'},
'a': {'type': 'integer'},
@@ -6,7 +7,7 @@ MODEL_SPEC = {
'c': {'type': 'freestyle_string'},
'foo': {'type': 'file'},
'bar': {'type': 'file'},
'data_dir': {'type': 'directory'},
'data_dir': {'type': 'directory', 'contents': {}},
'raster': {'type': 'raster'},
'vector': {'type': 'vector'},
'simple_table': {'type': 'csv'},

View File

@@ -1,4 +1,5 @@
MODEL_SPEC = {
'model_id': 'duplicate_filepaths_model',
'args': {
'foo': {'type': 'file'},
'bar': {'type': 'file'},

View File

@@ -1,6 +1,7 @@
MODEL_SPEC = {
'model_id': 'nonspatial_model',
'args': {
'some_file': {'type': 'file'},
'data_dir': {'type': 'directory'},
'data_dir': {'type': 'directory', 'contents': {}},
}
}

View File

@@ -1,4 +1,5 @@
MODEL_SPEC = {
'model_id': 'raster_model',
'args': {
'raster': {'type': 'raster'},
}

View File

@@ -1,9 +1,10 @@
MODEL_SPEC = {
'model_id': 'simple_model',
'args': {
'a': {'type': 'integer'},
'b': {'type': 'freestyle_string'},
'c': {'type': 'freestyle_string'},
'd': {'type': 'freestyle_string'},
'workspace_dir': {'type': 'directory'},
'workspace_dir': {'type': 'directory', 'contents': {}},
}
}

View File

@@ -1,4 +1,5 @@
MODEL_SPEC = {
'model_id': 'ui_parameters_model',
'args': {
'foo': {'type': 'freestyle_string'},
'bar': {'type': 'freestyle_string'},

View File

@@ -1,4 +1,5 @@
MODEL_SPEC = {
'model_id': 'vector_model',
'args': {
'vector': {'type': 'vector'},
}

View File

@@ -338,7 +338,7 @@ class TestMetadataFromSpec(unittest.TestCase):
"""Override tearDown function to remove temporary directory."""
shutil.rmtree(self.workspace_dir)
def test_write_metadata(self):
def test_write_metadata_for_outputs(self):
"""Test writing metadata for an invest output workspace."""
# An example invest output spec
@@ -389,7 +389,7 @@ class TestMetadataFromSpec(unittest.TestCase):
args_dict = {'workspace_dir': self.workspace_dir}
spec_utils.generate_metadata(model_module, args_dict)
spec_utils.generate_metadata_for_outputs(model_module, args_dict)
files, messages = geometamaker.validate_dir(
self.workspace_dir, recursive=True)
self.assertEqual(len(files), 2)