XGBoost ranker training input data format on Ray

21 views Asked by At

I am trying to train an XGBoost ranker model on Ray, reading the training data through BigQueryDatasource (the data can be large).

The cluster has been set up so that other XGBoost models (such as logistic) work well. I tried to use xgboost_ray.RayDMatrix as the input, since the ranker model requires the extra qid column and this is the only way I found to pass in that information. However, I still get errors that are hard to understand.

# Train an XGBoost ranking model on a Ray cluster, reading the training data
# from BigQuery.
import ray
from vertex_ray import BigQueryDatasource
from xgboost_ray import RayDMatrix, RayParams, train

# Load the query result as a Ray dataset.
train_dataset = ray.data.read_datasource(
    BigQueryDatasource(),
    query="SELECT * FROM my_proj.my_dataset.my_table",
)
# NOTE(review): fully_executed() is expected to force the read to run now
# instead of lazily; newer Ray releases replace it with materialize() --
# confirm against the installed Ray version.
train_dataset.fully_executed()

# "y" is the label column and "qid" the query-group column of the table.
# Use the imported RayDMatrix name directly: the module name `xgboost_ray`
# itself is never imported, so `xgboost_ray.RayDMatrix` raises NameError.
ray_dmatrix = RayDMatrix(train_dataset, label="y", qid="qid")

bst = train(
    {
        "objective": "rank:ndcg",
    },
    ray_dmatrix,
    ray_params=RayParams(
        num_actors=2,  # Number of remote actors
        cpus_per_actor=1))

# Persist the trained booster to the local working directory.
bst.save_model("model.xgb")


(_wrapped pid=909) 2024-02-27 01:49:36,502      INFO main.py:1126 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.
(_wrapped pid=909) 2024-02-27 01:49:41,995      INFO main.py:1177 -- [RayXGBoost] Starting XGBoost training.
(_RemoteRayXGBoostActor pid=229, ip=10.16.0.164) [01:49:42] task [xgboost.ray]:137063672512960 got new rank 1
(_RemoteRayXGBoostActor pid=228, ip=10.16.0.164) [01:49:42] task [xgboost.ray]:135302736939760 got new rank 0
(_wrapped pid=909) 2024-02-27 01:49:44,679      INFO main.py:1694 -- [RayXGBoost] Finished XGBoost training on training data with total N=100 in 8.25 seconds (2.67 pure XGBoost training time).
---------------------------------------------------------------------------
XGBoostError                              Traceback (most recent call last)
<ipython-input-22-2d4c36d612fd> in <cell line: 1>()
----> 1 bst = train(
      2     {
      3         "objective": "rank:ndcg",
      4     },
      5     ray_dmatrix,

7 frames
~/.local/lib/python3.10/site-packages/xgboost_ray/main.py in train(params, dtrain, num_boost_round, evals, evals_result, additional_results, ray_params, _remote, *args, **kwargs)
   1408         _wrapped = force_on_current_node(_wrapped)
   1409 
-> 1410         bst, train_evals_result, train_additional_results = ray.get(
   1411             _wrapped.remote(
   1412                 params,

~/.local/lib/python3.10/site-packages/ray/_private/client_mode_hook.py in wrapper(*args, **kwargs)
    102             # we only convert init function if RAY_CLIENT_MODE=1
    103             if func.__name__ != "init" or is_client_mode_enabled_by_default:
--> 104                 return getattr(ray, func.__name__)(*args, **kwargs)
    105         return func(*args, **kwargs)
    106 

~/.local/lib/python3.10/site-packages/ray/util/client/api.py in get(self, vals, timeout)
     40             timeout: Optional timeout in milliseconds
     41         """
---> 42         return self.worker.get(vals, timeout=timeout)
     43 
     44     def put(self, *args, **kwargs):

~/.local/lib/python3.10/site-packages/ray/util/client/worker.py in get(self, vals, timeout)
    432                 op_timeout = max_blocking_operation_time
    433             try:
--> 434                 res = self._get(to_get, op_timeout)
    435                 break
    436             except GetTimeoutError:

~/.local/lib/python3.10/site-packages/ray/util/client/worker.py in _get(self, ref, timeout)
    476         except grpc.RpcError as e:
    477             raise decode_exception(e)
--> 478         return loads_from_server(data)
    479 
    480     def put(

~/.local/lib/python3.10/site-packages/ray/util/client/client_pickler.py in loads_from_server(data, fix_imports, encoding, errors)
    176     return ServerUnpickler(
    177         file, fix_imports=fix_imports, encoding=encoding, errors=errors
--> 178     ).load()
    179 
    180 

~/.local/lib/python3.10/site-packages/xgboost/core.py in __setstate__(self, state)
   1679             length = c_bst_ulong(len(buf))
   1680             ptr = (ctypes.c_char * len(buf)).from_buffer(buf)
-> 1681             _check_call(
   1682                 _LIB.XGBoosterUnserializeFromBuffer(handle, ptr, length))
   1683             state['handle'] = handle

~/.local/lib/python3.10/site-packages/xgboost/core.py in _check_call(ret)
    277     """
    278     if ret != 0:
--> 279         raise XGBoostError(py_str(_LIB.XGBGetLastError()))
    280 
    281 

XGBoostError: [01:49:45] ../include/xgboost/json.h:81: Invalid cast, from Null to Object
Stack trace:
  [bt] (0) /root/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x139553) [0x7fed1bd39553]
  [bt] (1) /root/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x1638f5) [0x7fed1bd638f5]
  [bt] (2) /root/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x66ab91) [0x7fed1c26ab91]
  [bt] (3) /root/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x66b665) [0x7fed1c26b665]
  [bt] (4) /root/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x2e8ace) [0x7fed1bee8ace]
  [bt] (5) /root/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x2f1fc1) [0x7fed1bef1fc1]
  [bt] (6) /root/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGBoosterUnserializeFromBuffer+0x65) [0x7fed1bd3da55]
  [bt] (7) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7fedd78f4e2e]
  [bt] (8) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7fedd78f1493]

Similar data and an equivalent piece of code work with local xgboost and xgb.DMatrix:

# Local (non-Ray) baseline: the same ranking objective trained in-process.
from xgboost import DMatrix, train

# X, y and qid are the feature matrix, labels and query ids prepared
# elsewhere; they are not defined in this snippet.
dm = DMatrix(X, label=y, qid=qid)

# Train with the NDCG ranking objective and the library's default settings.
bst = train({"objective": "rank:ndcg"}, dm)

Does anyone know why this happens, and what the right way to do this is?

0

There are 0 answers