I have started several trials using ray.tune's PB2 scheduler. They use 8 actors and perturb every 20 steps. Actors 0-6 run without any trouble, but actor 7 consistently hits an error during its second 20-step epoch. In the terminal, I get the following message:
Traceback (most recent call last):
File "./tune_pb2.py", line 303, in <module>
raise_on_failed_trial=False)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/ray/tune/tune.py", line 411, in run
runner.step()
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/ray/tune/trial_runner.py", line 572, in step
self.trial_executor.on_no_available_trials(self)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/ray/tune/trial_executor.py", line 183, in on_no_available_trials
raise TuneError("There are paused trials, but no more pending "
ray.tune.error.TuneError: There are paused trials, but no more pending trials with sufficient resources.
I am training with 2 GPUs and 2 CPUs, with one GPU and one CPU allocated to each actor. At this point in the process, actors 0-6 have finished the second epoch and are paused; actor 7 is the only one running. The error.txt file for that trial contains the following:
Traceback (most recent call last):
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/ray/tune/trial_runner.py", line 755, in _process_trial
self, trial, flat_result)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/ray/tune/schedulers/pbt.py", line 415, in on_trial_result
lower_quantile)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/ray/tune/schedulers/pbt.py", line 479, in _perturb_trial
self._exploit(trial_runner.trial_executor, trial, trial_to_clone)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/ray/tune/schedulers/pbt.py", line 532, in _exploit
new_config = self._get_new_config(trial, trial_to_clone)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/ray/tune/schedulers/pb2.py", line 357, in _get_new_config
trial_to_clone.config)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/ray/tune/schedulers/pb2.py", line 174, in explore
X, y, current, newpoint, bounds, num_f=len(t_r.columns))
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/ray/tune/schedulers/pb2.py", line 83, in select_config
m = GPy.models.GPRegression(X, y, kernel)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/paramz/parameterized.py", line 58, in __call__
self.initialize_parameter()
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/paramz/core/parameter_core.py", line 337, in initialize_parameter
self.trigger_update()
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/paramz/core/updateable.py", line 79, in trigger_update
self._trigger_params_changed(trigger_parent)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/paramz/core/parameter_core.py", line 134, in _trigger_params_changed
self.notify_observers(None, None if trigger_parent else -np.inf)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/paramz/core/observable.py", line 91, in notify_observers
[callble(self, which=which) for _, _, callble in self.observers]
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/paramz/core/observable.py", line 91, in <listcomp>
[callble(self, which=which) for _, _, callble in self.observers]
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/paramz/core/parameter_core.py", line 508, in _parameters_changed_notification
self.parameters_changed()
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/GPy/core/gp.py", line 267, in parameters_changed
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.mean_function, self.Y_metadata)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/GPy/inference/latent_function_inference/exact_gaussian_inference.py", line 53, in inference
K = kern.K(X)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/GPy/kern/src/kernel_slice_operations.py", line 110, in wrap
ret = f(self, s.X, s.X2, *a, **kw)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/ray/tune/schedulers/pb2_utils.py", line 42, in K
dists = pairwise_distances(T1, T2, "cityblock")
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 1779, in pairwise_distances
return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 1360, in _parallel_pairwise
return func(X, Y, **kwds)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 781, in manhattan_distances
X, Y = check_pairwise_arrays(X, Y)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 147, in check_pairwise_arrays
estimator=estimator)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/sklearn/utils/validation.py", line 645, in check_array
allow_nan=force_all_finite == 'allow-nan')
File "/home/john/anaconda3/envs/python3.7/lib/python3.7/site-packages/sklearn/utils/validation.py", line 99, in _assert_all_finite
msg_dtype if msg_dtype is not None else X.dtype)
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
It looks like the error is raised within the ray.tune code itself, unless I'm missing something. If my tune code is relevant, I can provide that as well.
Any help would be greatly appreciated.