Source code for py21cmmc.mcmc

"""High-level functions for running MCMC chains."""
import logging
import numpy as np
import scipy.stats as stats
from concurrent.futures import ProcessPoolExecutor
from os import mkdir, path
from py21cmfast import yaml
from py21cmfast._utils import ParameterError

from .cosmoHammer import (
    CosmoHammerSampler,
    HDFStorageUtil,
    LikelihoodComputationChain,
    Params,
)

logger = logging.getLogger("21cmFAST")


def build_computation_chain(core_modules, likelihood_modules, params=None, setup=True):
    """
    Build a likelihood computation chain from core and likelihood modules.

    Parameters
    ----------
    core_modules : list
        A list of objects which define the necessary methods to be core modules (see
        :mod:`~py21cmmc.core`).
    likelihood_modules : list
        A list of objects which define the necessary methods to be likelihood modules
        (see :mod:`~py21cmmc.likelihood`).
    params : :class:`~py21cmmc.cosmoHammer.Params`, optional
        If provided, parameters which will be sampled by the chain.
    setup : bool, optional
        Whether to call ``setup()`` on the chain before returning it.

    Returns
    -------
    chain : :class:`~py21cmmc.cosmoHammer.LikelihoodComputationChain`
    """
    if not hasattr(core_modules, "__len__"):
        core_modules = [core_modules]
    if not hasattr(likelihood_modules, "__len__"):
        likelihood_modules = [likelihood_modules]

    chain = LikelihoodComputationChain(params)

    for cm in core_modules:
        chain.addCoreModule(cm)

    for lk in likelihood_modules:
        chain.addLikelihoodModule(lk)

    if setup:
        chain.setup()

    return chain
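
# Illustrative usage sketch (commented out; not part of the module). The
# CoreCoevalModule and Likelihood1DPowerCoeval names are assumed here as examples
# of py21cmmc core/likelihood classes; swap in your own modules. The evaluation
# calls mirror how run_mcmc below drives the chain.
#
#     from py21cmmc import core, likelihood
#
#     chain = build_computation_chain(
#         core_modules=core.CoreCoevalModule(redshift=9.0),
#         likelihood_modules=likelihood.Likelihood1DPowerCoeval(),
#     )
#     # Evaluate the chain at a single point: Params takes (name, value) pairs.
#     lnl = chain.computeLikelihoods(
#         chain.build_model_data(Params(("HII_EFF_FACTOR", 30.0)))
#     )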
def run_mcmc(
    core_modules,
    likelihood_modules,
    params,
    datadir=".",
    model_name="21CMMC",
    continue_sampling=True,
    reuse_burnin=True,
    log_level_21CMMC=None,
    sampler_cls=CosmoHammerSampler,
    use_multinest=False,
    use_zeus=False,
    use_ultranest=False,
    **mcmc_options,
) -> CosmoHammerSampler:
    r"""Run an MCMC chain.

    Parameters
    ----------
    core_modules : list
        A list of objects which define the necessary methods to be core modules (see
        :mod:`~py21cmmc.core`).
    likelihood_modules : list
        A list of objects which define the necessary methods to be likelihood modules
        (see :mod:`~py21cmmc.likelihood`).
    params : dict
        Parameters which will be sampled by the chain. Each entry's key specifies the
        name of the parameter, and its value is an iterable ``(val, min, max, width)``,
        with ``val`` the initial guess, ``min`` and ``max`` the hard boundaries on the
        parameter's value, and ``width`` determining the size of the initial ball of
        walker positions for the parameter.
    datadir : str, optional
        Directory to which MCMC info will be written (e.g. logs and chain files).
    model_name : str, optional
        Name of the model, which determines filenames of outputs.
    continue_sampling : bool, optional
        If an output chain file can be found that matches these inputs, sampling can
        be continued from its last iteration, up to the number of iterations
        specified. If set to ``False``, any output file which matches these parameters
        will have its samples over-written.
    reuse_burnin : bool, optional
        If a pre-computed chain file is found, and ``continue_sampling=False``,
        setting ``reuse_burnin`` will salvage the burnin part of the chain for re-use,
        but re-compute the samples themselves.
    log_level_21CMMC : int or str, optional
        The logging level of the cosmoHammer log file.
    use_multinest : bool, optional
        If True, use the MultiNest sampler instead.
    use_zeus : bool, optional
        If True, use the zeus sampler instead.
    use_ultranest : bool, optional
        If True, use the UltraNest sampler instead.

    Other Parameters
    ----------------
    \*\*mcmc_options :
        All other parameters are passed directly to
        :class:`~py21cmmc.cosmoHammer.CosmoHammerSampler`. These include important
        options such as ``walkersRatio`` (the number of walkers is
        ``walkersRatio * nparams``), ``sampleIterations``, ``burninIterations``,
        ``pool``, ``log_level_stream`` and ``threadCount``.

        If ``use_multinest``, parameters required by MultiNest, as shown below, should
        be provided here.

        n_live_points : int, optional
            Number of live points.
        importance_nested_sampling : bool, optional
            If True, MultiNest will use Importance Nested Sampling (INS).
        sampling_efficiency : float, optional
            Defines the sampling efficiency; 0.8 and 0.3 are recommended for parameter
            estimation and evidence evaluation, respectively.
        evidence_tolerance : float, optional
            A value of 0.5 should give good enough accuracy.
        max_iter : int, optional
            Maximum number of iterations; 0 is unlimited.
        multimodal : bool, optional
            Whether or not to detect multiple modes.
        write_output : bool, optional
            Whether to write output files; this is required for analysis.

        If ``use_zeus``, parameters required by zeus, as shown below, should be
        provided here.

        nsteps : int
            Number of steps per iteration (default is 100).
        ndim : int
            Number of dimensions to sample over (default is the number of supplied
            parameters).
        nwalkers : int
            Number of walkers (default is ``2 * ndim``).
        tolerance : float, optional
            Tuning optimization tolerance (default is 0.05).
        patience : int, optional
            Number of tuning steps to wait to make sure that tuning is done (default
            is 5).
        maxsteps : int, optional
            Maximum number of stepping-out steps (default is 10^4).
        mu : float, optional
            Scale factor (default is 1.0); this will be tuned if ``tune=True``.
        maxiter : int, optional
            Maximum number of expansions/contractions (default is 10^4).
        pool : bool, optional
            External pool of workers to distribute workload to multiple CPUs (default
            is None).
        vectorize : bool, optional
            If True (default is False), ``logprob_fn`` receives not just one point but
            an array of points, and returns an array of log-probabilities.
        blobs_dtype : list, optional
            List containing names and dtypes of blobs metadata, e.g.
            ``[("log_prior", float), ("mean", float)]``. Useful when you want to save
            multiple species of metadata. Default is None.
        verbose : bool, optional
            If True (default), print log statements.
        check_walkers : bool, optional
            If True (default), check that ``nwalkers >= 2 * ndim`` and even.
        shuffle_ensemble : bool, optional
            If True (default), shuffle the ensemble of walkers in every iteration
            before splitting it.
        light_mode : bool, optional
            If True (default is False), no expansions are performed after the tuning
            phase. This can significantly reduce the number of log-likelihood
            evaluations, but works best in target distributions that are approximately
            Gaussian.

        If ``use_ultranest``, parameters required by UltraNest, as shown below, should
        be provided here.

        log_dir : str
            Where to store output files.
        resume : 'resume', 'resume-similar', 'overwrite' or 'subfolder'
            If 'overwrite', overwrite previous data. If 'subfolder', create a fresh
            subdirectory in ``log_dir``. If 'resume' or True, continue the previous
            run if available; this only works when dimensionality, transform or
            likelihood are consistent. If 'resume-similar', continue the previous run
            if available; this only works when dimensionality and transform are
            consistent. If a likelihood difference is detected, the existing
            likelihoods are updated until the live point order differs; otherwise,
            behaves like 'resume'.
        run_num : int or None
            If ``resume == 'subfolder'``, this is the subfolder number. Automatically
            increments if set to None.
        num_test_samples : int
            Test the transform and likelihood with this number of random points for
            errors first. Useful to catch bugs.
        vectorized : bool
            If True, the ``loglike`` and ``transform`` functions can receive arrays of
            points.
        draw_multiple : bool
            If efficiency goes down, dynamically draw more points from the region,
            between ``ndraw_min`` and ``ndraw_max``. If set to False, few points are
            sampled at once.
        ndraw_min : int
            Minimum number of points to simultaneously propose. Increase this if your
            likelihood makes vectorization very cheap.
        ndraw_max : int
            Maximum number of points to simultaneously propose. Increase this if your
            likelihood makes vectorization very cheap. Memory allocation may be slow
            for extremely high values.
        num_bootstraps : int
            Number of logZ estimators and MLFriends region bootstrap rounds.
        warmstart_max_tau : float
            Maximum disorder to accept when ``resume='resume-similar'``; live points
            are reused as long as the live point order is below this normalised
            Kendall tau distance. Values range from 0 (highly conservative) to 1
            (extremely negligent).
        update_interval_volume_fraction : float
            Update the region when the volume has shrunk by this amount.
        log_interval : int
            Update the stdout status line every ``log_interval`` iterations.
        show_status : bool
            Show integration progress as a status line. If no output is desired, set
            to False.
        dlogz : float
            Target evidence uncertainty. This is the standard deviation between
            bootstrapped logZ integrators.
        dKL : float
            Target posterior uncertainty. This is the Kullback-Leibler divergence in
            nat between bootstrapped integrators.
        frac_remain : float
            Integrate until this fraction of the integral is left in the remainder.
            Set to a low number (1e-2 ... 1e-5) to make sure peaks are discovered; set
            to a higher number (0.5) if you know the posterior is simple.
        Lepsilon : float
            Terminate when live point likelihoods are all the same, within
            ``Lepsilon`` tolerance. Increase this when your likelihood function is
            inaccurate, to avoid unnecessary searching.
        min_ess : int
            Target number of effective posterior samples.
        max_iters : int
            Maximum number of integration iterations.
        max_ncalls : int
            Stop after this many likelihood evaluations.
        max_num_improvement_loops : int
            ``run()`` tries to assess iteratively where more samples are needed. This
            number limits the number of improvement loops.
        min_num_live_points : int
            Minimum number of live points throughout the run.
        cluster_num_live_points : int
            Require at least this many live points per detected cluster.
        insertion_test_zscore_threshold : float
            z-score used as a threshold for the insertion order test. Set to infinity
            to disable.
        insertion_test_window : int
            Number of iterations after which the insertion order test is reset.
        region_class : :class:`MLFriends`, :class:`RobustEllipsoidRegion` or :class:`SimpleRegion`
            Whether to use an MLFriends+ellipsoidal+tellipsoidal region (better for
            multi-modal problems), just ellipsoidal sampling (faster for
            high-dimensional, Gaussian-like problems), or an axis-aligned ellipsoid
            (fastest, to be combined with slice sampling).

    Returns
    -------
    sampler : :class:`~py21cmmc.cosmoHammer.CosmoHammerSampler` instance.
        The sampler object, from which the chain itself may be accessed (via the
        ``samples`` attribute). If ``use_zeus``, the zeus sampler is returned; if
        ``use_ultranest``, the UltraNest sampler and its result are returned.
    """
    file_prefix = path.join(datadir, model_name)

    # Check that only one sampler is specified.
    if sum([use_multinest, use_ultranest, use_zeus]) > 1:
        raise ValueError("You cannot use more than one sampler at a time!")

    if use_multinest:
        n_live_points = mcmc_options.get("n_live_points", 100)
        importance_nested_sampling = mcmc_options.get(
            "importance_nested_sampling", True
        )
        sampling_efficiency = mcmc_options.get("sampling_efficiency", 0.8)
        evidence_tolerance = mcmc_options.get("evidence_tolerance", 0.5)
        max_iter = mcmc_options.get("max_iter", 50)
        multimodal = mcmc_options.get("multimodal", True)
        write_output = mcmc_options.get("write_output", True)
        datadir = datadir + "/MultiNest/"

        try:
            from pymultinest import run
        except ImportError:
            raise ImportError("You need to install pymultinest to use this function!")

        try:
            mkdir(datadir)
        except FileExistsError:
            pass

    if use_zeus:
        # Number of parameters/dimensions; defaults to the number of supplied
        # parameters, as documented above.
        ndim = mcmc_options.get("ndim", len(params))
        # Number of walkers; should be at least twice the number of dimensions.
        nwalkers = mcmc_options.get("nwalkers", 2 * ndim)
        # Number of steps/iterations.
        nsteps = mcmc_options.get("nsteps", 100)
        # Set up parameters.
        params = Params(*[(k, v) for k, v in params.items()])

        # Initial positions of the walkers: a truncated-normal ball around the
        # initial guess, clipped to the hard parameter boundaries.
        start = np.asarray(
            [
                stats.truncnorm.rvs(
                    (params[i][1] - params[i][0]) / params[i][-1],
                    (params[i][2] - params[i][0]) / params[i][-1],
                    loc=params[i][0],
                    scale=params[i][-1],
                    size=nwalkers,
                )
                for i in range(ndim)
            ]
        ).T

        tolerance = mcmc_options.get("tolerance", 0.05)
        patience = mcmc_options.get("patience", 5)
        maxsteps = mcmc_options.get("maxsteps", 1e4)
        mu = mcmc_options.get("mu", 1.0)
        maxiter = mcmc_options.get("maxiter", 1e4)
        pool = mcmc_options.get("pool", None)
        vectorize = mcmc_options.get("vectorize", False)
        blobs_dtype = mcmc_options.get("blobs_dtype", None)
        verbose = mcmc_options.get("verbose", True)
        check_walkers = mcmc_options.get("check_walkers", True)
        shuffle_ensemble = mcmc_options.get("shuffle_ensemble", True)
        light_mode = mcmc_options.get("light_mode", False)

        try:
            import zeus
        except ImportError:
            raise ImportError("You need to install zeus to use this function!")

    if use_ultranest:
        try:
            import ultranest
        except ImportError:
            raise ImportError("You need to install ultranest to use this function!")

        log_dir = mcmc_options.get("log_dir", None)
        resume = mcmc_options.get("resume", "subfolder")
        run_num = mcmc_options.get("run_num", None)
        num_test_samples = mcmc_options.get("num_test_samples", 2)
        vectorized = mcmc_options.get("vectorized", False)
        draw_multiple = mcmc_options.get("draw_multiple", True)
        ndraw_min = mcmc_options.get("ndraw_min", 128)
        ndraw_max = mcmc_options.get("ndraw_max", 65536)
        num_bootstraps = mcmc_options.get("num_bootstraps", 30)
        warmstart_max_tau = mcmc_options.get("warmstart_max_tau", -1)
        update_interval_volume_fraction = mcmc_options.get(
            "update_interval_volume_fraction", 0.8
        )
        log_interval = mcmc_options.get("log_interval", None)
        show_status = mcmc_options.get("show_status", True)
        dlogz = mcmc_options.get("dlogz", 0.5)
        dKL = mcmc_options.get("dKL", 0.5)
        frac_remain = mcmc_options.get("frac_remain", 0.1)
        Lepsilon = mcmc_options.get("Lepsilon", 0.001)
        min_ess = mcmc_options.get("min_ess", 400)
        max_iters = mcmc_options.get("max_iters", None)
        max_ncalls = mcmc_options.get("max_ncalls", None)
        max_num_improvement_loops = mcmc_options.get("max_num_improvement_loops", -1)
        min_num_live_points = mcmc_options.get("min_num_live_points", 400)
        cluster_num_live_points = mcmc_options.get("cluster_num_live_points", 40)
        insertion_test_zscore_threshold = mcmc_options.get(
            "insertion_test_zscore_threshold", 4
        )
        insertion_test_window = mcmc_options.get("insertion_test_window", 10)
        region_class = mcmc_options.get("region_class", ultranest.mlfriends.MLFriends)

        # Logging setup.
        log_level_ultranest = (
            log_level_21CMMC if log_level_21CMMC is not None else "INFO"
        )
        ultranest_logger = logging.getLogger("ultranest")
        ultranest_logger.addHandler(logging.NullHandler())
        ultranest_logger.setLevel(log_level_ultranest)

    # Set up parameters.
    if not isinstance(params, Params):
        params = Params(*[(k, v) for k, v in params.items()])

    chain = build_computation_chain(
        core_modules, likelihood_modules, params, setup=False
    )

    if continue_sampling and not (use_multinest or use_zeus or use_ultranest):
        try:
            with open(file_prefix + ".LCC.yml") as f:
                old_chain = yaml.load(f)

            if old_chain != chain:
                raise RuntimeError(
                    "Attempting to continue chain, but chain parameters are different. "
                    f"Check your parameters against {file_prefix}.LCC.yml"
                )
        except FileNotFoundError:
            pass

        # We need to ensure that simulate=False if trying to continue sampling.
        for lk in chain.getLikelihoodModules():
            if hasattr(lk, "_simulate") and lk._simulate:
                logger.warning(
                    f"Likelihood {lk} was defined to re-simulate data/noise, but this "
                    "is incompatible with `continue_sampling`. Setting simulate=False "
                    "and continuing..."
                )
                lk._simulate = False

    # Write out the parameters *before* setup.
    # TODO: not sure if this is the best idea -- should it be after setup()?
    try:
        with open(file_prefix + ".LCC.yml", "w") as f:
            yaml.dump(chain, f)
    except Exception as e:
        logger.warning(
            "Attempt to write out YAML file containing LikelihoodComputationChain "
            f"failed ({e}). Boldly continuing..."
        )

    chain.setup()

    # Set logging levels.
    if log_level_21CMMC is not None:
        logging.getLogger("21CMMC").setLevel(log_level_21CMMC)

    if use_multinest:

        def likelihood(p, ndim, nparams):
            try:
                return chain.computeLikelihoods(
                    chain.build_model_data(
                        Params(*[(k, v) for k, v in zip(params.keys, p)])
                    )
                )
            except ParameterError:
                return -np.inf

        def prior(p, ndim, nparams):
            # Map the unit hypercube onto the (min, max) parameter ranges, in place.
            for i in range(ndim):
                p[i] = params[i][1] + p[i] * (params[i][2] - params[i][1])

        try:
            sampler = run(
                likelihood,
                prior,
                n_dims=len(params.keys),
                n_params=len(params.keys),
                n_live_points=n_live_points,
                resume=continue_sampling,
                write_output=write_output,
                outputfiles_basename=datadir + model_name,
                max_iter=max_iter,
                importance_nested_sampling=importance_nested_sampling,
                multimodal=multimodal,
                evidence_tolerance=evidence_tolerance,
                sampling_efficiency=sampling_efficiency,
                init_MPI=False,
            )
            return 1
        except OSError:  # pragma: nocover
            raise ImportError(
                "You also need to build the MultiNest library. See "
                "https://johannesbuchner.github.io/PyMultiNest/install.html#id4 "
                "for more information."
            )

    elif use_zeus:

        def prior(p):
            # Flag points that fall outside the hard parameter boundaries.
            for i, value in enumerate(p):
                if (value > params[i][2]) or (value < params[i][1]):
                    return p, True
            return p, False

        def likelihood(p):
            try:
                return chain.computeLikelihoods(
                    chain.build_model_data(
                        Params(*[(k, v) for k, v in zip(params.keys, p)])
                    )
                )
            except ParameterError:
                return -np.inf

        def posterior(p):
            # Pass the point into the prior to check whether it is in bounds.
            p, inf = prior(p)
            if inf:
                return -np.inf
            # If in bounds, evaluate the likelihood.
            return likelihood(p)

        # Initialise the sampler.
        sampler = zeus.EnsembleSampler(
            nwalkers,
            ndim,
            posterior,
            tolerance=tolerance,
            patience=patience,
            maxsteps=maxsteps,
            mu=mu,
            maxiter=maxiter,
            pool=pool,
            vectorize=vectorize,
            blobs_dtype=blobs_dtype,
            verbose=verbose,
            check_walkers=check_walkers,
            shuffle_ensemble=shuffle_ensemble,
            light_mode=light_mode,
        )

        # Run sampling.
        sampler.run_mcmc(start, nsteps)

        return sampler

    elif use_ultranest:

        def likelihood(p):
            if vectorized:
                return chain.computeLikelihoods(
                    chain.build_model_data(
                        Params(*[(k, v) for k, v in zip(params.keys, p.T)])
                    )
                )
            else:
                try:
                    return chain.computeLikelihoods(
                        chain.build_model_data(
                            Params(*[(k, v) for k, v in zip(params.keys, p)])
                        )
                    )
                except ParameterError:
                    return -np.inf

        def prior(p):
            # Map the unit hypercube onto the (min, max) parameter ranges.
            t = np.empty(p.shape, dtype=p.dtype)
            for i in range(p.shape[-1]):
                if vectorized:
                    t[:, i] = params[i][1] + p[:, i] * (params[i][2] - params[i][1])
                else:
                    t[i] = params[i][1] + p[i] * (params[i][2] - params[i][1])
            return t

        sampler = ultranest.ReactiveNestedSampler(
            params.keys,
            loglike=likelihood,
            transform=prior,
            resume=resume,
            run_num=run_num,
            log_dir=log_dir,
            num_test_samples=num_test_samples,
            draw_multiple=draw_multiple,
            num_bootstraps=num_bootstraps,
            vectorized=vectorized,
            ndraw_min=ndraw_min,
            ndraw_max=ndraw_max,
            warmstart_max_tau=warmstart_max_tau,
        )
        result = sampler.run(
            update_interval_volume_fraction=update_interval_volume_fraction,
            log_interval=log_interval,
            show_status=show_status,
            dlogz=dlogz,
            dKL=dKL,
            frac_remain=frac_remain,
            Lepsilon=Lepsilon,
            min_ess=min_ess,
            max_iters=max_iters,
            max_ncalls=max_ncalls,
            max_num_improvement_loops=max_num_improvement_loops,
            min_num_live_points=min_num_live_points,
            cluster_num_live_points=cluster_num_live_points,
            insertion_test_window=insertion_test_window,
            insertion_test_zscore_threshold=insertion_test_zscore_threshold,
            region_class=region_class,
        )
        return sampler, result

    else:
        pool = mcmc_options.pop(
            "pool",
            ProcessPoolExecutor(max_workers=mcmc_options.get("threadCount", 1)),
        )

        sampler = sampler_cls(
            continue_sampling=continue_sampling,
            likelihoodComputationChain=chain,
            storageUtil=HDFStorageUtil(file_prefix),
            filePrefix=file_prefix,
            reuseBurnin=reuse_burnin,
            pool=pool,
            **mcmc_options,
        )

        # The sampler writes to file, so no need to save anything ourselves.
        sampler.startSampling()

        return sampler
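
# Illustrative end-to-end sketch (commented out; not part of the module). The class
# and parameter names are assumptions for demonstration; the (val, min, max, width)
# tuple format and the walkersRatio/burninIterations/sampleIterations options follow
# the run_mcmc docstring above.
#
#     from py21cmmc import core, likelihood, mcmc
#
#     sampler = mcmc.run_mcmc(
#         core_modules=[core.CoreCoevalModule(redshift=9.0)],
#         likelihood_modules=[likelihood.Likelihood1DPowerCoeval()],
#         params={"HII_EFF_FACTOR": (30.0, 10.0, 50.0, 3.0)},
#         datadir=".",
#         model_name="my_model",
#         walkersRatio=2,
#         burninIterations=0,
#         sampleIterations=100,
#         threadCount=1,
#     )
#     samples = sampler.samples  # the chain, via the ``samples`` attribute
#
# To use a nested sampler instead, enable the corresponding flag and pass its
# options through ``mcmc_options``, e.g.:
#
#     mcmc.run_mcmc(..., use_multinest=True, n_live_points=200)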