invest/tests/test_recreation.py

"""InVEST Recreation model tests."""
import datetime
import glob
import zipfile
import socket
import threading
import unittest
import tempfile
import shutil
import os
import functools
import logging
import json
import queue
import multiprocessing
import time

import numpy
from osgeo import gdal
from osgeo import ogr
from osgeo import osr
import pandas
import pygeoprocessing
import Pyro4
import shapely
import taskgraph
import warnings

from natcap.invest import utils

# Switch the GDAL Python bindings into exception mode for this test module.
gdal.UseExceptions()
Pyro4.config.SERIALIZER = 'marshal'  # allow null bytes in strings

# Directory holding the recreation regression fixtures, located relative to
# this test file.
REGRESSION_DATA = os.path.join(
    os.path.dirname(__file__), '..', 'data', 'invest-test-data',
    'recreation')
# The 'input' subdirectory of REGRESSION_DATA; the tests below read their
# AOIs, predictor tables and point data from here.
SAMPLE_DATA = os.path.join(REGRESSION_DATA, 'input')

# Logger shared by the tests in this module.
LOGGER = logging.getLogger('test_recreation')


def _timeout(max_timeout):
    """Build a decorator that bounds each call to ``max_timeout`` seconds.

    The decorated function is executed in a daemon thread that is created
    fresh for every call, so the wrapper can be invoked any number of times.
    Because the target cannot be interrupted, a call that times out leaves
    its thread running in the background; the thread is a daemon so it
    will not keep the interpreter alive.

    Args:
        max_timeout (float): maximum number of seconds to wait for the
            decorated function to finish.

    Returns:
        A decorator. The function it produces returns whatever the target
        returns, re-raises any ``Exception`` raised by the target, and
        raises ``RuntimeError`` if the target has not finished within
        ``max_timeout`` seconds.

    """
    def timeout_decorator(target):
        """Wrap ``target`` so that every call to it is time-limited."""
        @functools.wraps(target)
        def func_wrapper(*args, **kwargs):
            """Run ``target`` in a worker thread and wait for its outcome."""
            # One queue per call: a late result from a call that already
            # timed out can never be mistaken for a later call's result.
            result_queue = queue.Queue()

            def worker():
                """Execute the target and report ``(succeeded, payload)``."""
                try:
                    result_queue.put((True, target(*args, **kwargs)))
                except Exception as error:
                    # Hand the exception to the caller's thread; it is
                    # re-raised there rather than in this worker.
                    result_queue.put((False, error))

            work_thread = threading.Thread(target=worker)
            work_thread.daemon = True
            work_thread.start()

            try:
                succeeded, payload = result_queue.get(timeout=max_timeout)
            except queue.Empty:
                raise RuntimeError("Timeout of %f exceeded" % max_timeout)
            if not succeeded:
                raise payload
            # Tagging the outcome lets a target legitimately *return* an
            # exception instance without it being raised.
            return payload
        return func_wrapper
    return timeout_decorator


def _make_empty_files(base_file_list):
    """Create (or truncate to zero length) every file in a list of paths.

    Args:
        base_file_list: an iterable of file paths; each one is opened for
            writing so that it exists and is empty afterwards.

    Returns:
        None.

    """
    for empty_path in base_file_list:
        # Opening in 'w' mode creates/truncates the file; nothing needs to
        # be written to leave it empty.
        with open(empty_path, 'w'):
            pass


def _resample_csv(base_csv_path, base_dst_path, resample_factor):
    """Downsize a csv file by keeping every ``resample_factor``-th line.

    Lines are counted from zero, so the first line of the source file is
    always kept, followed by lines ``resample_factor``,
    ``2 * resample_factor`` and so on.

    Args:
        base_csv_path (str): path to the source csv file to be resampled.
        base_dst_path (str): path to the destination csv file.
        resample_factor (int): keep one line out of every
            ``resample_factor`` lines of the source.

    Returns:
        None

    """
    with open(base_csv_path, 'r') as source, \
            open(base_dst_path, 'w') as destination:
        destination.writelines(
            row for row_index, row in enumerate(source)
            if row_index % resample_factor == 0)


class TestBufferedNumpyDiskMap(unittest.TestCase):
    """Exercise append, read and delete on a BufferedNumpyDiskMap."""

    def setUp(self):
        """Create a temporary directory to hold the disk map."""
        self.workspace_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Remove the temporary directory and everything in it."""
        shutil.rmtree(self.workspace_dir)

    def test_basic_operation(self):
        """Recreation test buffered file manager basic ops w/ no buffer."""
        from natcap.invest.recreation import buffered_numpy_disk_map

        disk_map_path = os.path.join(self.workspace_dir, 'test')
        # 0 is the second constructor argument; per this test's summary it
        # corresponds to running the manager with no buffer.
        disk_map = buffered_numpy_disk_map.BufferedNumpyDiskMap(
            disk_map_path, 0)

        repeated_key = 1234
        single_key = 4321

        # Two appends under the same key should be concatenated on read.
        for _ in range(2):
            disk_map.append(repeated_key, numpy.array([1, 2, 3, 4]))
        disk_map.append(single_key, numpy.array([-4, -1, -2, 4]))

        expected_repeated = numpy.array([1, 2, 3, 4, 1, 2, 3, 4])
        numpy.testing.assert_equal(
            disk_map.read(repeated_key), expected_repeated)

        expected_single = numpy.array([-4, -1, -2, 4])
        numpy.testing.assert_equal(
            disk_map.read(single_key), expected_single)

        # Reading a key after it has been deleted must fail.
        disk_map.delete(repeated_key)
        self.assertRaises(IOError, disk_map.read, repeated_key)


class TestRecServerLoop(unittest.TestCase):
    """Tests that use the rec server execute loop running in another process."""

    def setUp(self):
        """Create a workspace and start a recreation server subprocess.

        The sample point data is downsampled (every 10th line) into the
        workspace, a free TCP port is picked, and
        ``recmodel_server.execute`` is launched in a separate process
        configured to listen on that port.
        """
        from natcap.invest.recreation import recmodel_server

        self.workspace_dir = tempfile.mkdtemp()
        self.resampled_data_path = os.path.join(
            self.workspace_dir, 'resampled_data.csv')
        _resample_csv(
            os.path.join(SAMPLE_DATA, 'sample_data.csv'),
            self.resampled_data_path, resample_factor=10)

        # attempt to get an open port; could result in race condition but
        # will be okay for a test. if this test ever fails because of port
        # in use, that's probably why
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.bind(('', 0))
            self.port = sock.getsockname()[1]

        server_args = {
            'hostname': 'localhost',
            'port': self.port,
            'raw_csv_point_data_path': self.resampled_data_path,
            'cache_workspace': self.workspace_dir,
            'min_year': 2008,
            'max_year': 2015,
            'max_points_per_node': 200,
        }

        self.server_process = multiprocessing.Process(
            target=recmodel_server.execute, args=(server_args,), daemon=False)
        self.server_process.start()
        # need a few seconds for the server to be ready
        # Dave suggested that if this turns out to be flaky, we could instead
        # listen for the stdout from the server process indicating it's done
        # initializing, or poll the server and retry multiple times.
        time.sleep(5)

    def tearDown(self):
        """Stop the server process and delete the workspace."""
        self.server_process.terminate()
        # Reap the terminated process before deleting files it may still
        # hold open; the timeout keeps a stuck server from hanging the suite.
        self.server_process.join(timeout=10)
        shutil.rmtree(self.workspace_dir, ignore_errors=True)

    def test_all_metrics_local_server(self):
        """Recreation test with all but trivial predictor metrics.

        Executes Recreation model all the way through scenario prediction.
        With this 'extra_fields_features' AOI, we also cover two edge cases:
        1) the AOI has a pre-existing field that the model wishes to create.
        2) the AOI has features only covering nodata raster predictor values.
        """
        from natcap.invest.recreation import recmodel_client
        args = {
            'aoi_path': os.path.join(
                SAMPLE_DATA, 'andros_aoi_with_extra_fields_features.shp'),
            'compute_regression': True,
            'start_year': '2008',
            'end_year': '2014',
            'grid_aoi': False,
            'predictor_table_path': os.path.join(
                SAMPLE_DATA, 'predictors_all.csv'),
            'scenario_predictor_table_path': os.path.join(
                SAMPLE_DATA, 'predictors_all.csv'),
            'results_suffix': '',
            'workspace_dir': self.workspace_dir,
            'hostname': 'localhost',
            'port': self.port,
        }
        recmodel_client.execute(args)

        out_grid_vector_path = os.path.join(
            args['workspace_dir'], 'predictor_data.shp')
        expected_grid_vector_path = os.path.join(
            REGRESSION_DATA, 'predictor_data_all_metrics.shp')
        utils._assert_vectors_equal(
            out_grid_vector_path, expected_grid_vector_path, 1e-3)

        out_scenario_path = os.path.join(
            args['workspace_dir'], 'scenario_results.shp')
        expected_scenario_path = os.path.join(
            REGRESSION_DATA, 'scenario_results_all_metrics.shp')
        utils._assert_vectors_equal(
            out_scenario_path, expected_scenario_path, 1e-3)

    @_timeout(30.0)
    def test_execute_local_server(self):
        """Recreation base regression test on sample data on local server.

        Executes Recreation model all the way through scenario prediction.
        With this florida AOI, raster and vector predictors do not
        intersect the AOI. This makes for a fast test and incidentally
        covers an edge case.
        """
        from natcap.invest.recreation import recmodel_client

        args = {
            'aoi_path': os.path.join(
                SAMPLE_DATA, 'local_recreation_aoi_florida_utm18n.shp'),
            'cell_size': 40000.0,
            'compute_regression': True,
            'start_year': '2008',
            'end_year': '2014',
            'hostname': 'localhost',
            'port': self.port,
            'grid_aoi': True,
            'grid_type': 'hexagon',
            'predictor_table_path': os.path.join(
                SAMPLE_DATA, 'predictors.csv'),
            'results_suffix': '',
            'scenario_predictor_table_path': os.path.join(
                SAMPLE_DATA, 'predictors_scenario.csv'),
            'workspace_dir': self.workspace_dir,
        }

        recmodel_client.execute(args)

        _assert_regression_results_eq(
            args['workspace_dir'],
            os.path.join(REGRESSION_DATA, 'file_list_base_florida_aoi.txt'),
            os.path.join(args['workspace_dir'], 'scenario_results.shp'),
            os.path.join(REGRESSION_DATA, 'local_server_scenario_results.csv'))

    @_timeout(30.0)
    def test_workspace_fetcher(self):
        """Recreation test workspace fetcher on a local Pyro4 empty server.

        Uploads a zipped AOI to the server, then uses the workspace fetcher
        to download the server-side workspace and checks that the AOI in it
        matches the one that was uploaded.
        """
        from natcap.invest.recreation import recmodel_workspace_fetcher

        path = "PYRO:natcap.invest.recreation@localhost:%s" % self.port
        LOGGER.info("Local server path %s", path)
        recreation_server = Pyro4.Proxy(path)
        aoi_path = os.path.join(
            SAMPLE_DATA, 'test_aoi_for_subset.shp')
        basename = os.path.splitext(aoi_path)[0]
        aoi_archive_path = os.path.join(
            self.workspace_dir, 'aoi_zipped.zip')
        # Bundle every sidecar file of the shapefile (.shp, .dbf, ...).
        with zipfile.ZipFile(aoi_archive_path, 'w') as myzip:
            for filename in glob.glob(basename + '.*'):
                myzip.write(filename, os.path.basename(filename))

        # convert shapefile to binary string for serialization
        with open(aoi_archive_path, 'rb') as file:
            zip_file_binary = file.read()
        date_range = ('2005-01-01', '2014-12-31')
        out_vector_filename = 'test_aoi_for_subset_pud.shp'

        _, workspace_id = (
            recreation_server.calc_photo_user_days_in_aoi(
                zip_file_binary, date_range, out_vector_filename))
        fetcher_args = {
            'workspace_dir': self.workspace_dir,
            'hostname': 'localhost',
            'port': self.port,
            'workspace_id': workspace_id,
        }
        try:
            recmodel_workspace_fetcher.execute(fetcher_args)
        except Exception:
            # Record whether the server subprocess is still running to help
            # diagnose the failure, then let the original error propagate.
            LOGGER.error(
                "Server process failed (%s) is_alive=%s",
                str(self.server_process), self.server_process.is_alive())
            raise

        out_workspace_dir = os.path.join(
            self.workspace_dir, 'workspace_zip')
        os.makedirs(out_workspace_dir)
        workspace_zip_path = os.path.join(
            self.workspace_dir, workspace_id + '.zip')
        # Close the archive promptly so the workspace can be removed.
        with zipfile.ZipFile(workspace_zip_path, 'r') as workspace_zip:
            workspace_zip.extractall(out_workspace_dir)
        utils._assert_vectors_equal(
            aoi_path,
            os.path.join(out_workspace_dir, 'test_aoi_for_subset.shp'))

    def test_results_suffix_on_serverside_files(self):
        """Recreation test suffix gets added to files created on server."""
        from natcap.invest.recreation import recmodel_client

        args = {
            'aoi_path': os.path.join(
                SAMPLE_DATA, 'andros_aoi_with_extra_fields_features.shp'),
            'compute_regression': False,
            'start_year': '2014',
            'end_year': '2015',
            'grid_aoi': False,
            'results_suffix': 'hello',
            'workspace_dir': self.workspace_dir,
            'hostname': 'localhost',
            'port': self.port,
        }
        recmodel_client.execute(args)

        self.assertTrue(os.path.exists(
            os.path.join(args['workspace_dir'], 'monthly_table_hello.csv')))
        self.assertTrue(os.path.exists(
            os.path.join(args['workspace_dir'], 'pud_results_hello.shp')))


class TestRecServer(unittest.TestCase):
    """Tests for recmodel_server functions and the RecModel object."""

    def setUp(self):
        """Create a workspace containing a downsampled copy of the data.

        Every 10th line of the sample point data is written to
        ``self.resampled_data_path`` inside the workspace.
        """
        self.workspace_dir = tempfile.mkdtemp()
        self.resampled_data_path = os.path.join(
            self.workspace_dir, 'resampled_data.csv')
        _resample_csv(
            os.path.join(SAMPLE_DATA, 'sample_data.csv'),
            self.resampled_data_path, resample_factor=10)

    def tearDown(self):
        """Delete workspace."""
        shutil.rmtree(self.workspace_dir, ignore_errors=True)

    def test_hashfile(self):
        """Recreation test for hash of file."""
        from natcap.invest.recreation import recmodel_server
        file_hash = recmodel_server._hashfile(
            self.resampled_data_path, blocksize=2**20, fast_hash=False)
        # The exact encoded string that is hashed is dependent on python
        # version, with Python 3 including b prefix and \n suffix.
        # these hashes are for [py2.7, py3.6]
        self.assertIn(file_hash, ['c052e7a0a4c5e528', 'c8054b109d7a9d2a'])

    def test_hashfile_fast(self):
        """Recreation test for hash and fast hash of file."""
        from natcap.invest.recreation import recmodel_server
        file_hash = recmodel_server._hashfile(
            self.resampled_data_path, blocksize=2**20, fast_hash=True)
        # we can't assert the full hash since it is dependent on the file
        # last access time and we can't reliably set that in Python.
        # instead we just check that at the very least it ends with _fast_hash
        self.assertTrue(file_hash.endswith('_fast_hash'))

    def test_year_order(self):
        """Recreation ensure that end year < start year raise ValueError."""
        from natcap.invest.recreation import recmodel_server

        with self.assertRaises(ValueError):
            # intentionally construct start year > end year
            recmodel_server.RecModel(
                self.resampled_data_path,
                2014, 2005, os.path.join(self.workspace_dir, 'server_cache'))

    def test_local_aggregate_points(self):
        """Recreation test single threaded local AOI aggregate calculation.

        Sends a zipped AOI to an in-process RecModel, compares the returned
        PUD vector against the regression vector, then fetches the
        server-side workspace and checks it contains the original AOI.
        """
        from natcap.invest.recreation import recmodel_server

        recreation_server = recmodel_server.RecModel(
            self.resampled_data_path, 2005, 2014,
            os.path.join(self.workspace_dir, 'server_cache'))

        aoi_path = os.path.join(SAMPLE_DATA, 'test_aoi_for_subset.shp')

        basename = os.path.splitext(aoi_path)[0]
        aoi_archive_path = os.path.join(
            self.workspace_dir, 'aoi_zipped.zip')
        # Bundle every sidecar file of the shapefile (.shp, .dbf, ...).
        with zipfile.ZipFile(aoi_archive_path, 'w') as myzip:
            for filename in glob.glob(basename + '.*'):
                myzip.write(filename, os.path.basename(filename))

        # convert shapefile to binary string for serialization
        with open(aoi_archive_path, 'rb') as file:
            zip_file_binary = file.read()

        # transfer zipped file to server
        date_range = ('2005-01-01', '2014-12-31')
        out_vector_filename = 'test_aoi_for_subset_pud.shp'
        zip_result, workspace_id = (
            recreation_server.calc_photo_user_days_in_aoi(
                zip_file_binary, date_range, out_vector_filename))

        # unpack result
        result_zip_path = os.path.join(self.workspace_dir, 'pud_result.zip')
        with open(result_zip_path, 'wb') as file:
            file.write(zip_result)
        # Close the archive promptly so the workspace can be removed.
        with zipfile.ZipFile(result_zip_path, 'r') as result_zip:
            result_zip.extractall(self.workspace_dir)

        result_vector_path = os.path.join(
            self.workspace_dir, out_vector_filename)
        expected_vector_path = os.path.join(
            REGRESSION_DATA, 'test_aoi_for_subset_pud.shp')
        utils._assert_vectors_equal(expected_vector_path, result_vector_path)

        # ensure the remote workspace is as expected
        workspace_zip_binary = recreation_server.fetch_workspace_aoi(
            workspace_id)
        out_workspace_dir = os.path.join(self.workspace_dir, 'workspace_zip')
        os.makedirs(out_workspace_dir)
        workspace_zip_path = os.path.join(out_workspace_dir, 'workspace.zip')
        with open(workspace_zip_path, 'wb') as file:
            file.write(workspace_zip_binary)
        with zipfile.ZipFile(workspace_zip_path, 'r') as workspace_zip:
            workspace_zip.extractall(out_workspace_dir)
        utils._assert_vectors_equal(
            aoi_path,
            os.path.join(out_workspace_dir, 'test_aoi_for_subset.shp'))

    def test_local_calc_poly_pud(self):
        """Recreation test single threaded local PUD calculation."""
        from natcap.invest.recreation import recmodel_server

        recreation_server = recmodel_server.RecModel(
            self.resampled_data_path,
            2005, 2014, os.path.join(self.workspace_dir, 'server_cache'))

        date_range = (
            numpy.datetime64('2005-01-01'),
            numpy.datetime64('2014-12-31'))

        # Work queue: one feature ID followed by the 'STOP' sentinel.
        poly_test_queue = queue.Queue()
        poly_test_queue.put(0)
        poly_test_queue.put('STOP')
        pud_poly_feature_queue = queue.Queue()
        recmodel_server._calc_poly_pud(
            recreation_server.qt_pickle_filename,
            os.path.join(SAMPLE_DATA, 'test_aoi_for_subset.shp'),
            date_range, poly_test_queue, pud_poly_feature_queue)

        # assert annual average PUD is the same as regression
        self.assertEqual(
            83.2, pud_poly_feature_queue.get()[1][0])

    def test_local_calc_poly_pud_bad_aoi(self):
        """Recreation test PUD calculation with missing AOI features."""
        from natcap.invest.recreation import recmodel_server

        recreation_server = recmodel_server.RecModel(
            self.resampled_data_path,
            2005, 2014, os.path.join(self.workspace_dir, 'server_cache'))

        date_range = (
            numpy.datetime64('2005-01-01'),
            numpy.datetime64('2014-12-31'))

        aoi_vector_path = os.path.join(self.workspace_dir, 'aoi.gpkg')
        gpkg_driver = gdal.GetDriverByName('GPKG')
        srs = osr.SpatialReference()
        srs.ImportFromEPSG(32731)  # WGS84/UTM zone 31s
        target_vector = gpkg_driver.Create(
            aoi_vector_path, 0, 0, 0, gdal.GDT_Unknown)
        target_layer = target_vector.CreateLayer(
            'target_layer', srs, ogr.wkbUnknown)

        # Testing with an AOI of 2 features, one is missing Geometry.
        input_geom_list = [
            None,
            ogr.CreateGeometryFromWkt(
                'POLYGON ((1 1, 1 0, 0 0, 0 1, 1 1))')]
        poly_test_queue = queue.Queue()
        poly_test_queue.put(1)  # gpkg FIDs start at 1
        poly_test_queue.put(2)
        target_layer.StartTransaction()
        for geometry in input_geom_list:
            feature = ogr.Feature(target_layer.GetLayerDefn())
            feature.SetGeometry(geometry)
            target_layer.CreateFeature(feature)
        target_layer.CommitTransaction()
        poly_test_queue.put('STOP')
        # Drop the references so the vector is released before it is read
        # back by the function under test.
        target_layer = None
        target_vector = None

        pud_poly_feature_queue = queue.Queue()
        recmodel_server._calc_poly_pud(
            recreation_server.qt_pickle_filename,
            aoi_vector_path, date_range, poly_test_queue,
            pud_poly_feature_queue)

        # assert PUD was calculated for the one good AOI feature.
        self.assertEqual(
            0.0, pud_poly_feature_queue.get()[1][0])

    def test_local_calc_existing_cached(self):
        """Recreation local PUD calculation on existing quadtree."""
        from natcap.invest.recreation import recmodel_server

        recreation_server = recmodel_server.RecModel(
            self.resampled_data_path,
            2005, 2014, os.path.join(self.workspace_dir, 'server_cache'))
        recreation_server = None
        # This will not generate a new quadtree but instead load existing one
        recreation_server = recmodel_server.RecModel(
            self.resampled_data_path,
            2005, 2014, os.path.join(self.workspace_dir, 'server_cache'))

        date_range = (
            numpy.datetime64('2005-01-01'),
            numpy.datetime64('2014-12-31'))

        poly_test_queue = queue.Queue()
        poly_test_queue.put(0)
        poly_test_queue.put('STOP')
        pud_poly_feature_queue = queue.Queue()
        recmodel_server._calc_poly_pud(
            recreation_server.qt_pickle_filename,
            os.path.join(SAMPLE_DATA, 'test_aoi_for_subset.shp'),
            date_range, poly_test_queue, pud_poly_feature_queue)

        # assert annual average PUD is the same as regression
        self.assertEqual(
            83.2, pud_poly_feature_queue.get()[1][0])

    def test_parse_input_csv(self):
        """Recreation test parsing raw CSV."""
        from natcap.invest.recreation import recmodel_server

        # A single (offset, size) block followed by the 'STOP' sentinel.
        block_offset_size_queue = queue.Queue()
        block_offset_size_queue.put((0, 2**10))
        block_offset_size_queue.put('STOP')
        numpy_array_queue = queue.Queue()
        recmodel_server._parse_input_csv(
            block_offset_size_queue, self.resampled_data_path,
            numpy_array_queue)
        val = recmodel_server._numpy_loads(numpy_array_queue.get())
        # we know what the first date is
        self.assertEqual(val[0][0], datetime.date(2013, 3, 16))

    def test_numpy_pickling_queue(self):
        """Recreation test _numpy_dumps and _numpy_loads."""
        from natcap.invest.recreation import recmodel_server

        numpy_array_queue = multiprocessing.Queue()
        array = numpy.empty(1, dtype='datetime64,f4')
        numpy_array_queue.put(recmodel_server._numpy_dumps(array))

        out_array = recmodel_server._numpy_loads(numpy_array_queue.get())
        numpy.testing.assert_equal(out_array, array)
        # without _numpy_loads, the queue pickles the array imperfectly,
        # adding a metadata value to the `datetime64` dtype.
        # assert that this doesn't happen. 'f0' is the first subdtype.
        self.assertEqual(out_array.dtype['f0'].metadata, None)

        # assert that saving the array does not raise a warning
        with warnings.catch_warnings(record=True) as ws:
            # cause all warnings to always be triggered
            warnings.simplefilter("always")
            numpy.save(os.path.join(self.workspace_dir, 'out'), out_array)
            # assert that no warning was raised; assertEqual reports the
            # actual count on failure.
            self.assertEqual(len(ws), 0)


class TestLocalRecServer(unittest.TestCase):
    """Tests using a local rec server."""

    def setUp(self):
        """Create a workspace and an in-process RecModel on the sample data.

        The model is built from the full ``sample_data.csv`` for the years
        2005 through 2014, with its cache inside the workspace.
        """
        from natcap.invest.recreation import recmodel_server
        self.workspace_dir = tempfile.mkdtemp()
        self.recreation_server = recmodel_server.RecModel(
            os.path.join(SAMPLE_DATA, 'sample_data.csv'),
            2005, 2014, os.path.join(self.workspace_dir, 'server_cache'))

    def tearDown(self):
        """Delete workspace."""
        shutil.rmtree(self.workspace_dir)

    def test_local_aoi(self):
        """Recreation test local AOI with local server.

        Aggregates points in the AOI and compares the resulting
        ``monthly_table.csv`` line-by-line with the regression table.
        """
        aoi_path = os.path.join(SAMPLE_DATA, 'test_local_aoi_for_subset.shp')
        date_range = (
            numpy.datetime64('2010-01-01'),
            numpy.datetime64('2014-12-31'))
        out_vector_filename = os.path.join(self.workspace_dir, 'pud.shp')
        self.recreation_server._calc_aggregated_points_in_aoi(
            aoi_path, self.workspace_dir, date_range, out_vector_filename)

        with open(os.path.join(
                self.workspace_dir, 'monthly_table.csv'), 'r') as file:
            output_lines = file.readlines()
        with open(os.path.join(
                REGRESSION_DATA, 'expected_monthly_table_for_subset.csv'),
                'r') as file:
            expected_lines = file.readlines()

        # Use an assertion so a mismatch is reported as a test failure
        # rather than as an error raised from inside the test.
        self.assertEqual(
            output_lines, expected_lines,
            "Output table not the same as input.\n"
            "Expected:\n%s\nGot:\n%s" % (expected_lines, output_lines))


class RecreationRegressionTests(unittest.TestCase):
    """Regression tests for InVEST Recreation model."""

    def setUp(self):
        """Create a temporary workspace directory for the test."""
        # Creating the workspace here (and removing it in tearDown) lets us
        # delete it after the test is done, no matter the test result.
        self.workspace_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Delete the workspace directory and everything inside it."""
        shutil.rmtree(self.workspace_dir)

    def test_data_different_projection(self):
        """Recreation can validate if data in different projection.

        The validator is given the Andros AOI together with a predictor
        table named for its wrong projection, and must return a message
        reporting the mismatch.
        """
        from natcap.invest.recreation import recmodel_client

        aoi_vector_path = os.path.join(SAMPLE_DATA, 'andros_aoi.shp')
        predictor_table_path = os.path.join(
            SAMPLE_DATA, 'predictors_wrong_projection.csv')

        validation_message = recmodel_client._validate_same_projection(
            aoi_vector_path, predictor_table_path)

        self.assertIn('did not match the projection', validation_message)

    def test_different_tables(self):
        """Recreation can validate if scenario ids different than predictor.

        Two different predictor tables are passed as the base/scenario
        pair, and the validator must return a message flagging them as
        unequal.
        """
        from natcap.invest.recreation import recmodel_client

        predictor_table = os.path.join(SAMPLE_DATA, 'predictors_all.csv')
        scenario_table = os.path.join(SAMPLE_DATA, 'predictors.csv')

        validation_message = recmodel_client._validate_same_ids_and_types(
            predictor_table, scenario_table)

        self.assertIn('table pairs unequal', validation_message)

    def test_delay_op(self):
        """Recreation coverage of delay op function."""
        from natcap.invest.recreation import recmodel_client

        # There is little to verify beyond the callback being invoked.
        # A negative "last time" guarantees the delay has already elapsed,
        # because the current time can never be negative.
        invocations = []

        recmodel_client.delay_op(
            -1.0, 1.0, lambda: invocations.append(True))

        # The list is non-empty only if the callback ran.
        self.assertTrue(invocations)

    def test_raster_sum_mean_no_nodata(self):
        """Recreation test sum/mean if raster doesn't have nodata defined."""
        from natcap.invest.recreation import recmodel_client

        # This raster does not define a nodata value.
        no_nodata_raster_path = os.path.join(
            SAMPLE_DATA, 'no_nodata_raster.tif')
        aoi_vector_path = os.path.join(SAMPLE_DATA, 'andros_aoi.shp')
        predictor_json_path = os.path.join(
            self.workspace_dir, "predictor.json")

        recmodel_client._raster_sum_mean(
            no_nodata_raster_path, "mean", aoi_vector_path,
            predictor_json_path)

        with open(predictor_json_path, 'r') as predictor_file:
            mean_by_feature = json.load(predictor_file)

        # These constants were calculated by hand by Dave.
        numpy.testing.assert_allclose(
            mean_by_feature['0'], 13.0, rtol=0, atol=1e-6)

    def test_raster_sum_mean_nodata(self):
        """Recreation test sum/mean if raster has no valid pixels.

        This may be a raster that does not intersect with the AOI, or
        one that does intersect, but is entirely nodata within the AOI.
        Such a raster is not usable as a predictor variable.
        """
        from natcap.invest.recreation import recmodel_client

        # The following raster has only nodata pixels.
        raster_path = os.path.join(SAMPLE_DATA, 'nodata_raster.tif')
        response_vector_path = os.path.join(SAMPLE_DATA, 'andros_aoi.shp')
        target_path = os.path.join(self.workspace_dir, "predictor.json")

        recmodel_client._raster_sum_mean(
            raster_path, "sum", response_vector_path, target_path)

        with open(target_path, 'r') as file:
            predictor_results = json.load(file)
        # Assert that target file was written and that it holds no entries.
        # A unittest assertion (unlike a bare ``assert``) is not stripped
        # under ``python -O`` and reports the actual length on failure.
        self.assertEqual(len(predictor_results), 0)

    def test_overlapping_features_in_polygon_predictor(self):
        """Recreation test overlapping predictor features not double-counted.

        If a polygon predictor contains features that overlap, the overlapping
        area should only be counted once when calculating `polygon_area_coverage`
        or `polygon_percent_coverage`.
        """
        from natcap.invest.recreation import recmodel_client

        workspace = self.workspace_dir
        aoi_vector_path = os.path.join(workspace, 'aoi.geojson')
        aoi_pickle_path = os.path.join(workspace, 'response.pickle')
        predictor_vector_path = os.path.join(workspace, 'predictor.geojson')
        predictor_json_path = os.path.join(workspace, 'predictor.json')

        utm_srs = osr.SpatialReference()
        utm_srs.ImportFromEPSG(32610)  # a UTM system
        projection_wkt = utm_srs.ExportToWkt()

        unit_square_coords = (
            (0., 0.), (0., 1.), (1., 1.), (1., 0.), (0., 0.))
        quarter_square_coords = (
            (0., 0.), (0., 0.5), (0.5, 0.5), (0.5, 0.), (0., 0.))

        # The AOI is a single unit square.
        pygeoprocessing.shapely_geometry_to_vector(
            [shapely.geometry.Polygon(unit_square_coords)],
            aoi_vector_path,
            projection_wkt,
            'GEOJSON')

        # The predictor has two overlapping polygons: the same unit square
        # plus a smaller square lying inside it.
        pygeoprocessing.shapely_geometry_to_vector(
            [shapely.geometry.Polygon(unit_square_coords),
             shapely.geometry.Polygon(quarter_square_coords)],
            predictor_vector_path,
            projection_wkt,
            'GEOJSON')

        recmodel_client._prepare_response_polygons_lookup(
            aoi_vector_path, aoi_pickle_path)
        recmodel_client._polygon_area(
            'polygon_area_coverage',
            aoi_pickle_path,
            predictor_vector_path,
            predictor_json_path)

        with open(predictor_json_path, 'r') as predictor_file:
            coverage_by_feature = json.load(predictor_file)

        # The covered area must be that of the unit square alone.
        first_coverage = list(coverage_by_feature.values())[0]
        self.assertEqual(first_coverage, 1)

    def test_least_squares_regression(self):
        """Recreation regression test for the least-squares linear model."""
        from natcap.invest.recreation import recmodel_client

        predictor_vector_path = os.path.join(
            REGRESSION_DATA, 'predictor_data.shp')
        pud_vector_path = os.path.join(
            REGRESSION_DATA, 'predictor_data_pud.shp')

        # The first element of the returned tuple is not checked here.
        regression_outputs = recmodel_client._build_regression(
            pud_vector_path, predictor_vector_path, 'PUD_YR_AVG')
        _, coefficients, ssres, r_sq, r_sq_adj, std_err, dof, se_est = (
            regression_outputs)

        actual = {
            'coefficients': coefficients,
            'ssres': ssres,
            'r_sq': r_sq,
            'r_sq_adj': r_sq_adj,
            'std_err': std_err,
            'dof': dof,
            'se_est': se_est,
        }

        # Dave created these numbers using Recreation model release/3.5.0
        expected = {
            'coefficients': [
                -3.67484238e-03, -8.76864968e-06, 1.75244536e-01,
                2.07040116e-01, 6.59076098e-01],
            'ssres': 11.03734250869611,
            'r_sq': 0.5768926587089602,
            'r_sq_adj': 0.5256069203706524,
            'std_err': 0.5783294255923199,
            'dof': 33,
            'se_est': [
                5.93275522e-03, 8.49251058e-06, 1.72921342e-01,
                6.39079593e-02, 3.98165865e-01],
        }

        for statistic, expected_value in expected.items():
            numpy.testing.assert_allclose(actual[statistic], expected_value)

    @unittest.skip("skipping to avoid remote server call (issue #3753)")
    def test_base_execute(self):
        """Recreation base regression test on fast sample data.

        Runs the model with the sample AOI, a hexagonal grid, and both the
        predictor and scenario predictor tables, then compares the workspace
        contents and scenario results against the stored expectations.
        """
        from natcap.invest.recreation import recmodel_client

        workspace = self.workspace_dir
        predictor_table = os.path.join(SAMPLE_DATA, 'predictors.csv')
        scenario_table = os.path.join(SAMPLE_DATA, 'predictors_scenario.csv')

        args = {
            'aoi_path': os.path.join(SAMPLE_DATA, 'andros_aoi.shp'),
            'cell_size': 40000.0,
            'compute_regression': True,
            'start_year': '2005',
            'end_year': '2014',
            'grid_aoi': True,
            'grid_type': 'hexagon',
            'predictor_table_path': predictor_table,
            'results_suffix': '',
            'scenario_predictor_table_path': scenario_table,
            'workspace_dir': workspace,
        }
        recmodel_client.execute(args)

        # Inputs for the shared regression-results checker.
        expected_file_list = os.path.join(
            REGRESSION_DATA, 'file_list_base.txt')
        scenario_vector = os.path.join(workspace, 'scenario_results.shp')
        expected_scenario_table = os.path.join(
            REGRESSION_DATA, 'scenario_results_40000.csv')
        _assert_regression_results_eq(
            workspace, expected_file_list, scenario_vector,
            expected_scenario_table)

    def test_square_grid(self):
        """Recreation square grid regression test.

        Grids the sample AOI with square cells and compares the output
        vector with the archived expected vector.
        """
        from natcap.invest.recreation import recmodel_client

        aoi_path = os.path.join(SAMPLE_DATA, 'andros_aoi.shp')
        expected_path = os.path.join(
            REGRESSION_DATA, 'square_grid_vector_path.shp')
        actual_path = os.path.join(
            self.workspace_dir, 'square_grid_vector_path.shp')

        recmodel_client._grid_vector(
            aoi_path, 'square', 20000.0, actual_path)

        utils._assert_vectors_equal(expected_path, actual_path)

    def test_hex_grid(self):
        """Recreation hex grid regression test.

        Grids the sample AOI with hexagonal cells and compares the output
        vector with the archived expected vector.
        """
        from natcap.invest.recreation import recmodel_client

        aoi_path = os.path.join(SAMPLE_DATA, 'andros_aoi.shp')
        expected_path = os.path.join(
            REGRESSION_DATA, 'hex_grid_vector_path.shp')
        actual_path = os.path.join(
            self.workspace_dir, 'hex_grid_vector_path.shp')

        recmodel_client._grid_vector(
            aoi_path, 'hexagon', 20000.0, actual_path)

        utils._assert_vectors_equal(expected_path, actual_path)

    @unittest.skip("skipping to avoid remote server call (issue #3753)")
    def test_no_grid_execute(self):
        """Recreation execute on ungridded AOI.

        Runs the model without gridding or regression and compares the
        resulting monthly table with the archived expected table.
        """
        from natcap.invest.recreation import recmodel_client

        workspace = self.workspace_dir
        args = {
            'aoi_path': os.path.join(SAMPLE_DATA, 'andros_aoi.shp'),
            'compute_regression': False,
            'start_year': '2005',
            'end_year': '2014',
            'grid_aoi': False,
            'results_suffix': '',
            'workspace_dir': workspace,
        }
        recmodel_client.execute(args)

        expected_table_path = os.path.join(
            REGRESSION_DATA, 'expected_monthly_table_for_no_grid.csv')
        actual_table_path = os.path.join(workspace, 'monthly_table.csv')

        expected_table = pandas.read_csv(expected_table_path)
        actual_table = pandas.read_csv(actual_table_path)
        pandas.testing.assert_frame_equal(
            expected_table, actual_table, check_dtype=False)

    def test_predictor_id_too_long(self):
        """Recreation can validate predictor ID length.

        Validates arguments that point at a predictor table with an overly
        long ID and checks the text of the first validation message.
        """
        from natcap.invest.recreation import recmodel_client

        long_id_table = os.path.join(
            SAMPLE_DATA, 'predictors_id_too_long.csv')
        args = {
            'aoi_path': os.path.join(SAMPLE_DATA, 'andros_aoi.shp'),
            'compute_regression': True,
            'start_year': '2005',
            'end_year': '2014',
            'grid_aoi': True,
            'grid_type': 'square',
            'cell_size': 20000,
            'predictor_table_path': long_id_table,
            'results_suffix': '',
            'workspace_dir': self.workspace_dir,
        }

        validation_messages = recmodel_client.validate(args)
        # Each entry is indexed as (keys, message); check the message text.
        first_message_text = validation_messages[0][1]
        self.assertIn('more than 10 characters long', first_message_text)

    def test_existing_output_shapefiles(self):
        """Recreation grid test when output files need to be overwritten.

        Writes the hexagonal grid twice to the same path so the second call
        has to replace the first call's output.
        """
        from natcap.invest.recreation import recmodel_client

        aoi_path = os.path.join(SAMPLE_DATA, 'andros_aoi.shp')
        actual_path = os.path.join(
            self.workspace_dir, 'hex_grid_vector_path.shp')

        # First pass creates the output; second pass overwrites it.
        for _ in range(2):
            recmodel_client._grid_vector(
                aoi_path, 'hexagon', 20000.0, actual_path)

        expected_path = os.path.join(
            REGRESSION_DATA, 'hex_grid_vector_path.shp')
        utils._assert_vectors_equal(expected_path, actual_path)

    def test_existing_regression_coef(self):
        """Recreation test regression coefficients handle existing output.

        Creates empty placeholder files at the output locations, schedules
        predictor processing, and compares the coefficient vector with the
        archived expected vector.
        """
        from natcap.invest.recreation import recmodel_client
        from natcap.invest import validation

        workspace = self.workspace_dir

        # Initialize a TaskGraph; n_workers of -1 is single process mode.
        task_graph = taskgraph.TaskGraph(
            os.path.join(workspace, '_taskgraph_working_dir'), -1)

        response_vector_path = os.path.join(
            workspace, 'no_grid_vector_path.shp')
        response_polygons_lookup_path = os.path.join(
            workspace, 'response_polygons_lookup.pickle')
        recmodel_client._copy_aoi_no_grid(
            os.path.join(SAMPLE_DATA, 'andros_aoi.shp'), response_vector_path)

        predictor_table_path = os.path.join(SAMPLE_DATA, 'predictors.csv')

        # make outputs to be overwritten
        table_spec = recmodel_client.MODEL_SPEC['args']['predictor_table_path']
        predictor_dict = validation.get_validated_dataframe(
            predictor_table_path, **table_spec).to_dict(orient='index')
        tmp_working_dir = tempfile.mkdtemp(dir=workspace)
        out_coefficient_vector_path = os.path.join(
            workspace, 'out_coefficient_vector.shp')
        placeholder_paths = [out_coefficient_vector_path]
        for predictor_id in predictor_dict:
            placeholder_paths.append(
                os.path.join(tmp_working_dir, predictor_id + '.json'))
        _make_empty_files(placeholder_paths)

        prepare_response_polygons_task = task_graph.add_task(
            func=recmodel_client._prepare_response_polygons_lookup,
            args=(response_vector_path, response_polygons_lookup_path),
            target_path_list=[response_polygons_lookup_path],
            task_name='prepare response polygons for geoprocessing')

        # build again to test against overwriting output
        recmodel_client._schedule_predictor_data_processing(
            response_vector_path, response_polygons_lookup_path,
            prepare_response_polygons_task, predictor_table_path,
            out_coefficient_vector_path, tmp_working_dir, task_graph)

        expected_coeff_vector_path = os.path.join(
            REGRESSION_DATA, 'test_regression_coefficients.shp')
        utils._assert_vectors_equal(
            expected_coeff_vector_path, out_coefficient_vector_path, 1e-6)

    def test_predictor_table_absolute_paths(self):
        """Recreation test validation from full path.

        Writes a predictor table whose paths are built from SAMPLE_DATA and
        checks that `_validate_same_projection` accepts it.
        """
        from natcap.invest.recreation import recmodel_client

        response_vector_path = os.path.join(
            self.workspace_dir, 'no_grid_vector_path.shp')
        recmodel_client._copy_aoi_no_grid(
            os.path.join(SAMPLE_DATA, 'andros_aoi.shp'), response_vector_path)

        # these are absolute paths for predictor data
        predictor_dir = os.path.join(SAMPLE_DATA, 'predictors')
        predictor_rows = [
            ('ports',
             os.path.join(predictor_dir, 'dredged_ports.shp'),
             'point_count'),
            ('airdist',
             os.path.join(predictor_dir, 'airport.shp'),
             'point_nearest_distance'),
            ('bonefish',
             os.path.join(predictor_dir, 'bonefish_simp.shp'),
             'polygon_percent_coverage'),
            ('bathy',
             os.path.join(predictor_dir, 'dem90m_coarse.tif'),
             'raster_mean'),
        ]

        predictor_table_path = os.path.join(
            self.workspace_dir, 'predictors.csv')
        with open(predictor_table_path, 'w') as table_file:
            table_file.write('id,path,type\n')
            table_file.writelines(
                '%s,%s,%s\n' % row for row in predictor_rows)

        # _validate_same_projection is expected not to raise ValueError.
        # Catching it and calling self.fail reports that case as a test
        # failure, while any other exception still surfaces as an error,
        # keeping the two outcomes distinguishable.
        try:
            recmodel_client._validate_same_projection(
                response_vector_path, predictor_table_path)
        except ValueError:
            self.fail(
                "_validate_same_projection raised ValueError unexpectedly!")

    def test_year_order(self):
        """Recreation ensure that end year < start year raise ValueError.

        The validation message is checked first, then `execute` is expected
        to reject the same arguments.
        """
        from natcap.invest.recreation import recmodel_client

        predictor_table = os.path.join(SAMPLE_DATA, 'predictors.csv')
        scenario_table = os.path.join(SAMPLE_DATA, 'predictors_scenario.csv')
        args = {
            'aoi_path': os.path.join(SAMPLE_DATA, 'andros_aoi.shp'),
            'cell_size': 7000.0,
            'compute_regression': True,
            'start_year': '2014',  # note start_year > end_year
            'end_year': '2005',
            'grid_aoi': True,
            'grid_type': 'hexagon',
            'predictor_table_path': predictor_table,
            'results_suffix': '',
            'scenario_predictor_table_path': scenario_table,
            'workspace_dir': self.workspace_dir,
        }

        validation_messages = recmodel_client.validate(args)
        first_message_text = validation_messages[0][1]
        self.assertEqual(
            'Start year must be less than or equal to end year.',
            first_message_text)
        self.assertRaises(ValueError, recmodel_client.execute, args)

    def test_bad_grid_type(self):
        """Recreation ensure that bad grid type raises ValueError.

        Passes an unsupported `grid_type` value to `execute`.
        """
        from natcap.invest.recreation import recmodel_client

        aoi_path = os.path.join(SAMPLE_DATA, 'andros_aoi.shp')
        args = {
            'aoi_path': aoi_path,
            'cell_size': 7000.0,
            'compute_regression': False,
            'start_year': '2005',
            'end_year': '2014',
            'grid_aoi': True,
            'grid_type': 'circle',  # intentionally bad gridtype
            'results_suffix': '',
            'workspace_dir': self.workspace_dir,
        }

        self.assertRaises(ValueError, recmodel_client.execute, args)

    def test_start_year_out_of_range(self):
        """Recreation that start_year out of range raise ValueError.

        Passes a start year far earlier than the end year to `execute`.
        """
        from natcap.invest.recreation import recmodel_client

        predictor_table = os.path.join(SAMPLE_DATA, 'predictors.csv')
        scenario_table = os.path.join(SAMPLE_DATA, 'predictors_scenario.csv')
        args = {
            'aoi_path': os.path.join(SAMPLE_DATA, 'andros_aoi.shp'),
            'cell_size': 7000.0,
            'compute_regression': True,
            'start_year': '1219',  # start year ridiculously out of range
            'end_year': '2014',
            'grid_aoi': True,
            'grid_type': 'hexagon',
            'predictor_table_path': predictor_table,
            'results_suffix': '',
            'scenario_predictor_table_path': scenario_table,
            'workspace_dir': self.workspace_dir,
        }

        self.assertRaises(ValueError, recmodel_client.execute, args)

    def test_end_year_out_of_range(self):
        """Recreation that end_year out of range raise ValueError.

        Passes an end year far later than the start year to `execute`.
        """
        from natcap.invest.recreation import recmodel_client

        predictor_table = os.path.join(SAMPLE_DATA, 'predictors.csv')
        scenario_table = os.path.join(SAMPLE_DATA, 'predictors_scenario.csv')
        args = {
            'aoi_path': os.path.join(SAMPLE_DATA, 'andros_aoi.shp'),
            'cell_size': 7000.0,
            'compute_regression': True,
            'start_year': '2005',
            'end_year': '2219',  # end year ridiculously out of range
            'grid_aoi': True,
            'grid_type': 'hexagon',
            'predictor_table_path': predictor_table,
            'results_suffix': '',
            'scenario_predictor_table_path': scenario_table,
            'workspace_dir': self.workspace_dir,
        }

        self.assertRaises(ValueError, recmodel_client.execute, args)


class RecreationValidationTests(unittest.TestCase):
    """Tests for the Recreation Model MODEL_SPEC and validation."""

    def setUp(self):
        """Create a temporary workspace and list the always-required keys."""
        self.workspace_dir = tempfile.mkdtemp()
        # Keys reported as invalid when validating an empty args dict.
        self.base_required_keys = [
            'workspace_dir',
            'aoi_path',
            'start_year',
            'end_year'
        ]

    def tearDown(self):
        """Remove the temporary workspace after a test."""
        shutil.rmtree(self.workspace_dir)

    def test_missing_keys(self):
        """Recreation Validate: assert missing required keys."""
        from natcap.invest.recreation import recmodel_client
        from natcap.invest import validation

        validation_errors = recmodel_client.validate({})  # empty args dict.
        invalid_keys = validation.get_invalid_keys(validation_errors)
        expected_missing_keys = set(self.base_required_keys)
        self.assertEqual(invalid_keys, expected_missing_keys)

    def test_missing_keys_grid_aoi(self):
        """Recreation Validate: assert missing keys for grid option."""
        from natcap.invest.recreation import recmodel_client
        from natcap.invest import validation

        validation_errors = recmodel_client.validate({'grid_aoi': True})
        invalid_keys = validation.get_invalid_keys(validation_errors)
        # Enabling the grid option adds the grid settings to the base keys.
        expected_missing_keys = set(
            self.base_required_keys + ['grid_type', 'cell_size'])
        self.assertEqual(invalid_keys, expected_missing_keys)

    def test_missing_keys_compute_regression(self):
        """Recreation Validate: assert missing keys for regression option."""
        from natcap.invest.recreation import recmodel_client
        from natcap.invest import validation

        validation_errors = recmodel_client.validate(
            {'compute_regression': True})
        invalid_keys = validation.get_invalid_keys(validation_errors)
        # Enabling regression adds the predictor table to the base keys.
        expected_missing_keys = set(
            self.base_required_keys + ['predictor_table_path'])
        self.assertEqual(invalid_keys, expected_missing_keys)

    def test_bad_predictor_table_header(self):
        """Recreation Validate: assert messages for bad table headers."""
        # Import the submodule explicitly, as the other tests do, rather
        # than relying on it already being bound on the package.
        from natcap.invest.recreation import recmodel_client
        from natcap.invest import validation

        table_path = os.path.join(self.workspace_dir, 'table.csv')
        with open(table_path, 'w') as table_file:
            table_file.write('foo,bar,baz\n')
            table_file.write('a,b,c\n')

        no_id_header_message = validation.MESSAGES[
            'MATCHED_NO_HEADERS'].format(header='column', header_name='id')

        # Only the predictor table is supplied: exactly one message.
        expected_message = [(['predictor_table_path'], no_id_header_message)]
        validation_warnings = recmodel_client.validate({
            'compute_regression': True,
            'predictor_table_path': table_path,
            'start_year': '2012',
            'end_year': '2016',
            'workspace_dir': self.workspace_dir,
            'aoi_path': os.path.join(SAMPLE_DATA, 'andros_aoi.shp')})

        self.assertEqual(validation_warnings, expected_message)

        # The same bad table used for both inputs: one message per input,
        # in any order.
        validation_warnings = recmodel_client.validate({
            'compute_regression': True,
            'predictor_table_path': table_path,
            'scenario_predictor_table_path': table_path,
            'start_year': '2012',
            'end_year': '2016',
            'workspace_dir': self.workspace_dir,
            'aoi_path': os.path.join(SAMPLE_DATA, 'andros_aoi.shp')})
        expected_messages = [
            (['predictor_table_path'], no_id_header_message),
            (['scenario_predictor_table_path'], no_id_header_message)]
        self.assertEqual(len(validation_warnings), 2)
        for message in expected_messages:
            self.assertIn(message, validation_warnings)

    def test_validate_predictor_types_whitespace(self):
        """Recreation Validate: assert type validation ignores whitespace."""
        from natcap.invest.recreation import recmodel_client

        predictor_id = 'dem90m'
        raster_path = os.path.join(
            SAMPLE_DATA, 'predictors', 'dem90m_coarse.tif')
        # include trailing whitespace in the type, this should pass
        table_path = os.path.join(self.workspace_dir, 'table.csv')
        with open(table_path, 'w') as table_file:
            table_file.write('id,path,type\n')
            table_file.write(f'{predictor_id},{raster_path},raster_mean \n')

        args = {
            'aoi_path': os.path.join(SAMPLE_DATA, 'andros_aoi.shp'),
            'cell_size': 40000.0,
            'compute_regression': True,
            'start_year': '2005',
            'end_year': '2014',
            'grid_aoi': False,
            'predictor_table_path': table_path,
            'workspace_dir': self.workspace_dir,
        }

        # there should be no error when the type has trailing whitespace
        # NOTE(review): this runs the full model, while other execute-based
        # tests in this file are skipped to avoid remote server calls
        # (issue #3753) -- confirm this run does not need the server.
        recmodel_client.execute(args)
        output_path = os.path.join(
            self.workspace_dir, 'regression_coefficients.txt')

        # the regression_coefficients.txt output file should contain the
        # predictor id, meaning it wasn't dropped from the regression
        with open(output_path, 'r') as output_file:
            self.assertIn(predictor_id, output_file.read())

    def test_validate_predictor_types_incorrect(self):
        """Recreation Validate: assert error on incorrect type value."""
        from natcap.invest.recreation import recmodel_client

        predictor_id = 'dem90m'
        raster_path = os.path.join(
            SAMPLE_DATA, 'predictors', 'dem90m_coarse.tif')
        # include a typo in the type, this should fail
        bad_table_path = os.path.join(self.workspace_dir, 'bad_table.csv')
        with open(bad_table_path, 'w') as table_file:
            table_file.write('id,path,type\n')
            table_file.write(f'{predictor_id},{raster_path},raster?mean\n')

        args = {
            'aoi_path': os.path.join(SAMPLE_DATA, 'andros_aoi.shp'),
            'cell_size': 40000.0,
            'compute_regression': True,
            'start_year': '2005',
            'end_year': '2014',
            'grid_aoi': False,
            'predictor_table_path': bad_table_path,
            'workspace_dir': self.workspace_dir,
        }
        msgs = recmodel_client.validate(args)
        self.assertIn('The table contains invalid type value(s)', msgs[0][1])


def _assert_regression_results_eq(
        workspace_dir, file_list_path, result_vector_path,
        expected_results_path):
    """Test workspace against the expected list of files and results.

    Args:
        workspace_dir (string): path to the completed model workspace
        file_list_path (string): path to a file that has a list of all
            the expected files relative to the workspace base
        result_vector_path (string): path to shapefile
            produced by the Recreation model.
        expected_results_path (string): path to a csv file that has the
            expected results of a scenario prediction model run.

    Returns:
        None

    Raises:
        AssertionError if any files are missing, if a result differs from
        its expected value by more than 10**-`tolerance_places`, or if a
        result is missing where the expected value is not NaN.
    """
    # Bind the GDAL handles before the ``try`` so the ``finally`` clause is
    # safe to run when the file check or the open itself fails; otherwise
    # cleanup would raise UnboundLocalError and hide the real failure.
    result_vector = None
    result_layer = None
    try:
        # Test that the workspace has the same files as we expect
        _test_same_files(file_list_path, workspace_dir)

        # The tolerance of 3 digits after the decimal was determined by
        # experimentation on the application with the given range of
        # numbers.  This is an apparently reasonable approach as described
        # by ChrisF: http://stackoverflow.com/a/3281371/42897
        # and even more reading about picking numerical tolerance
        # https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
        tolerance_places = 3

        result_vector = gdal.OpenEx(result_vector_path, gdal.OF_VECTOR)
        result_layer = result_vector.GetLayer()
        expected_results = pandas.read_csv(expected_results_path, dtype=float)
        field_names = list(expected_results)
        for feature in result_layer:
            values = [feature.GetField(field) for field in field_names]
            # The feature's FID selects the matching row of expected values.
            fid = feature.GetFID()
            expected_values = list(expected_results.iloc[fid])
            for v, ev in zip(values, expected_values):
                if v is not None:
                    numpy.testing.assert_allclose(
                        v, ev, rtol=0, atol=10**-tolerance_places)
                elif not numpy.isnan(ev):
                    # A missing value could happen when a raster predictor
                    # is only nodata; the expected value must then be NaN.
                    # Raised explicitly so the check survives ``python -O``.
                    raise AssertionError(
                        'Result is missing but expected value is %s '
                        '(FID %s)' % (ev, fid))
            feature = None

    finally:
        result_layer = None
        if result_vector is not None:
            gdal.Dataset.__swig_destroy__(result_vector)
        result_vector = None


def _test_same_files(base_list_path, directory_path):
    """Assert expected files are in the `directory_path`.

    Args:
        base_list_path (string): a path to a file that has one relative
            file path per line. Blank lines are ignored.
        directory_path (string): a path to a directory whose contents will
            be checked against the files listed in `base_list_path`

    Returns:
        None

    Raises:
        AssertionError when there are files listed in `base_list_path`
            that don't exist in the directory indicated by
            `directory_path`
    """
    missing_files = []
    with open(base_list_path, 'r') as file_list:
        for line in file_list:
            relative_path = line.rstrip()
            if not relative_path:
                # Skip blank lines. This has to be checked before joining:
                # the joined path always contains `directory_path` and so
                # is never empty.
                continue
            full_path = os.path.join(directory_path, relative_path)
            if not os.path.isfile(full_path):
                missing_files.append(full_path)
    if missing_files:
        raise AssertionError(
            "The following files were expected but not found: " +
            '\n'.join(missing_files))