68 changes: 41 additions & 27 deletions python-package/lightgbm/basic.py
@@ -360,6 +360,7 @@ def _is_1d_collection(data: Any) -> bool:


def _list_to_1d_numpy(
*,
data: Any,
dtype: "np.typing.DTypeLike",
name: str,
@@ -1833,7 +1834,7 @@ def __del__(self) -> None:
except AttributeError:
pass

def _create_sample_indices(self, total_nrow: int) -> np.ndarray:
def _create_sample_indices(self, *, total_nrow: int) -> np.ndarray:
"""Get an array of randomly chosen indices from this ``Dataset``.

Indices are sampled without replacement.
@@ -2160,26 +2161,26 @@ def _lazy_init(
)
)
elif isinstance(data, scipy.sparse.csr_matrix):
self.__init_from_csr(data, params_str, ref_dataset)
self.__init_from_csr(csr=data, params_str=params_str, ref_dataset=ref_dataset)
elif isinstance(data, scipy.sparse.csc_matrix):
self.__init_from_csc(data, params_str, ref_dataset)
self.__init_from_csc(csc=data, params_str=params_str, ref_dataset=ref_dataset)
elif isinstance(data, np.ndarray):
self.__init_from_np2d(data, params_str, ref_dataset)
self.__init_from_np2d(mat=data, params_str=params_str, ref_dataset=ref_dataset)
elif _is_pyarrow_table(data):
self.__init_from_pyarrow_table(data, params_str, ref_dataset)
self.__init_from_pyarrow_table(table=data, params_str=params_str, ref_dataset=ref_dataset)
elif isinstance(data, list) and len(data) > 0:
if _is_list_of_numpy_arrays(data):
self.__init_from_list_np2d(data, params_str, ref_dataset)
self.__init_from_list_np2d(mats=data, params_str=params_str, ref_dataset=ref_dataset)
elif _is_list_of_sequences(data):
self.__init_from_seqs(data, ref_dataset)
self.__init_from_seqs(seqs=data, ref_dataset=ref_dataset)
else:
raise TypeError("Data list can only be of ndarray or Sequence")
elif isinstance(data, Sequence):
self.__init_from_seqs([data], ref_dataset)
self.__init_from_seqs(seqs=[data], ref_dataset=ref_dataset)
else:
try:
csr = scipy.sparse.csr_matrix(data)
self.__init_from_csr(csr, params_str, ref_dataset)
self.__init_from_csr(csr=csr, params_str=params_str, ref_dataset=ref_dataset)
except BaseException as err:
raise TypeError(f"Cannot initialize Dataset from {type(data).__name__}") from err
if label is not None:
@@ -2218,7 +2219,7 @@ def _yield_row_from_seqlist(seqs: List[Sequence], indices: Iterable[int]) -> Ite
row = seq[id_in_seq]
yield row if row.flags["OWNDATA"] else row.copy()

def __sample(self, seqs: List[Sequence], total_nrow: int) -> Tuple[List[np.ndarray], List[np.ndarray]]:
def __sample(self, *, seqs: List[Sequence], total_nrow: int) -> Tuple[List[np.ndarray], List[np.ndarray]]:
"""Sample data from seqs.

Mimics behavior in c_api.cpp:LGBM_DatasetCreateFromMats()
@@ -2227,7 +2228,7 @@ def __sample(self, seqs: List[Sequence], total_nrow: int) -> Tuple[List[np.ndarr
-------
sampled_rows, sampled_row_indices
"""
indices = self._create_sample_indices(total_nrow)
indices = self._create_sample_indices(total_nrow=total_nrow)

# Select sampled rows, transpose to column order.
sampled = np.array(list(self._yield_row_from_seqlist(seqs, indices)))
@@ -2248,6 +2249,7 @@ def __sample(self, seqs: List[Sequence], total_nrow: int) -> Tuple[List[np.ndarr

def __init_from_seqs(
self,
*,
seqs: List[Sequence],
ref_dataset: Optional[_DatasetHandle],
) -> "Dataset":
@@ -2268,7 +2270,7 @@ def __init_from_seqs(
param_str = _param_dict_to_str(self.get_params())
sample_cnt = _get_sample_count(total_nrow, param_str)

sample_data, col_indices = self.__sample(seqs, total_nrow)
sample_data, col_indices = self.__sample(seqs=seqs, total_nrow=total_nrow)
self._init_from_sample(sample_data, col_indices, sample_cnt, total_nrow)

for seq in seqs:
@@ -2281,6 +2283,7 @@ def __init_from_seqs(

def __init_from_np2d(
self,
*,
mat: np.ndarray,
params_str: str,
ref_dataset: Optional[_DatasetHandle],
@@ -2308,6 +2311,7 @@ def __init_from_np2d(

def __init_from_list_np2d(
self,
*,
mats: List[np.ndarray],
params_str: str,
ref_dataset: Optional[_DatasetHandle],
@@ -2362,6 +2366,7 @@ def __init_from_list_np2d(

def __init_from_csr(
self,
*,
csr: scipy.sparse.csr_matrix,
params_str: str,
ref_dataset: Optional[_DatasetHandle],
@@ -2396,6 +2401,7 @@ def __init_from_csr(

def __init_from_csc(
self,
*,
csc: scipy.sparse.csc_matrix,
params_str: str,
ref_dataset: Optional[_DatasetHandle],
@@ -2430,6 +2436,7 @@ def __init_from_csc(

def __init_from_pyarrow_table(
self,
*,
table: pa_Table,
params_str: str,
ref_dataset: Optional[_DatasetHandle],
@@ -2459,6 +2466,7 @@ def __init_from_pyarrow_table(

@staticmethod
def _compare_params_for_warning(
*,
params: Dict[str, Any],
other_params: Dict[str, Any],
ignore_keys: Set[str],
@@ -2528,7 +2536,11 @@ def construct(self) -> "Dataset":
)
else:
# construct subset
used_indices = _list_to_1d_numpy(self.used_indices, dtype=np.int32, name="used_indices")
used_indices = _list_to_1d_numpy(
data=self.used_indices,
dtype=np.int32,
name="used_indices",
)
assert used_indices.flags.c_contiguous
if self.reference.group is not None:
group_info = np.array(self.reference.group).astype(np.int32, copy=False)
@@ -2796,9 +2808,9 @@ def set_field(
if field_name == "init_score":
dtype = np.float64
if _is_1d_collection(data):
data = _list_to_1d_numpy(data, dtype=dtype, name=field_name)
data = _list_to_1d_numpy(data=data, dtype=dtype, name=field_name)
elif _is_2d_collection(data):
data = _data_to_2d_numpy(data, dtype=dtype, name=field_name)
data = _data_to_2d_numpy(data=data, dtype=dtype, name=field_name)
data = data.ravel(order="F")
else:
raise TypeError(
@@ -2810,7 +2822,7 @@ def set_field(
dtype = np.int32
else:
dtype = np.float32
data = _list_to_1d_numpy(data, dtype=dtype, name=field_name)
data = _list_to_1d_numpy(data=data, dtype=dtype, name=field_name)

ptr_data: Union[_ctypes_float_ptr, _ctypes_int_ptr]
if data.dtype == np.float32 or data.dtype == np.float64:
@@ -3051,7 +3063,7 @@ def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset":
elif _is_pyarrow_array(label):
label_array = label
else:
label_array = _list_to_1d_numpy(label, dtype=np.float32, name="label")
label_array = _list_to_1d_numpy(data=label, dtype=np.float32, name="label")
self.set_field("label", label_array)
self.label = self.get_field("label") # original values can be modified at cpp side
return self
@@ -3084,7 +3096,7 @@ def set_weight(
# Set field
if self._handle is not None and weight is not None:
if not _is_pyarrow_array(weight):
weight = _list_to_1d_numpy(weight, dtype=np.float32, name="weight")
weight = _list_to_1d_numpy(data=weight, dtype=np.float32, name="weight")
self.set_field("weight", weight)
self.weight = self.get_field("weight") # original values can be modified at cpp side
return self
@@ -3134,7 +3146,7 @@ def set_group(
self.group = group
if self._handle is not None and group is not None:
if not _is_pyarrow_array(group):
group = _list_to_1d_numpy(group, dtype=np.int32, name="group")
group = _list_to_1d_numpy(data=group, dtype=np.int32, name="group")
self.set_field("group", group)
# original values can be modified at cpp side
constructed_group = self.get_field("group")
@@ -3160,7 +3172,7 @@ def set_position(
"""
self.position = position
if self._handle is not None and position is not None:
position = _list_to_1d_numpy(position, dtype=np.int32, name="position")
position = _list_to_1d_numpy(data=position, dtype=np.int32, name="position")
self.set_field("position", position)
return self

@@ -3877,6 +3889,7 @@ def _get_node_index(
return f"{tree_num}{node_type}{node_num}"

def _get_split_feature(
*,
tree: Dict[str, Any],
feature_names: Optional[List[str]],
) -> Optional[str]:
@@ -3900,7 +3913,7 @@ def _is_single_node_tree(tree: Dict[str, Any]) -> bool:
node["left_child"] = None
node["right_child"] = None
node["parent_index"] = parent_node
node["split_feature"] = _get_split_feature(tree, feature_names)
node["split_feature"] = _get_split_feature(tree=tree, feature_names=feature_names)
node["split_gain"] = None
node["threshold"] = None
node["decision_type"] = None
@@ -4125,11 +4138,12 @@ def update(
else:
if not self.__set_objective_to_none:
self.reset_parameter({"objective": "none"}).__set_objective_to_none = True
grad, hess = fobj(self.__inner_predict(0), self.train_set)
return self.__boost(grad, hess)
grad, hess = fobj(self.__inner_predict(data_idx=0), self.train_set)
return self.__boost(grad=grad, hess=hess)

def __boost(
self,
*,
grad: np.ndarray,
hess: np.ndarray,
) -> bool:
@@ -4164,8 +4178,8 @@ def __boost(
if self.__num_class > 1:
grad = grad.ravel(order="F")
hess = hess.ravel(order="F")
grad = _list_to_1d_numpy(grad, dtype=np.float32, name="gradient")
hess = _list_to_1d_numpy(hess, dtype=np.float32, name="hessian")
grad = _list_to_1d_numpy(data=grad, dtype=np.float32, name="gradient")
hess = _list_to_1d_numpy(data=hess, dtype=np.float32, name="hessian")
assert grad.flags.c_contiguous
assert hess.flags.c_contiguous
if len(grad) != len(hess):
@@ -5171,7 +5185,7 @@ def __inner_eval(
for eval_function in feval:
if eval_function is None:
continue
feval_ret = eval_function(self.__inner_predict(data_idx), cur_data)
feval_ret = eval_function(self.__inner_predict(data_idx=data_idx), cur_data)
if isinstance(feval_ret, list):
for eval_name, val, is_higher_better in feval_ret:
ret.append((data_name, eval_name, val, is_higher_better))
@@ -5180,7 +5194,7 @@ def __inner_eval(
ret.append((data_name, eval_name, val, is_higher_better))
return ret

def __inner_predict(self, data_idx: int) -> np.ndarray:
def __inner_predict(self, *, data_idx: int) -> np.ndarray:
"""Predict for training and validation dataset."""
if data_idx >= self.__num_dataset:
raise ValueError("Data_idx should be smaller than number of dataset")
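
Note: the pattern applied throughout basic.py is Python's bare `*` marker, which makes every parameter after it keyword-only, so these internal helpers can no longer be called positionally. A minimal standalone sketch of the mechanism (the helper below is illustrative, not LightGBM's actual function):

import numpy as np

def to_1d_array(*, data, dtype, name):
    # every argument after the bare "*" must be passed by keyword
    arr = np.asarray(data, dtype=dtype)
    if arr.ndim != 1:
        raise ValueError(f"{name} must be 1-dimensional")
    return arr

to_1d_array(data=[1, 2, 3], dtype=np.float32, name="label")  # OK
# to_1d_array([1, 2, 3], np.float32, "label")                # TypeError
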
6 changes: 3 additions & 3 deletions python-package/lightgbm/callback.py
@@ -304,13 +304,13 @@ def _reset_storages(self) -> None:
self.cmp_op: List[Callable[[float, float], bool]] = []
self.first_metric = ""

def _gt_delta(self, curr_score: float, best_score: float, delta: float) -> bool:
def _gt_delta(self, curr_score: float, best_score: float, *, delta: float) -> bool:
return curr_score > best_score + delta

def _lt_delta(self, curr_score: float, best_score: float, delta: float) -> bool:
def _lt_delta(self, curr_score: float, best_score: float, *, delta: float) -> bool:
return curr_score < best_score - delta

def _is_train_set(self, dataset_name: str, env: CallbackEnv) -> bool:
def _is_train_set(self, *, dataset_name: str, env: CallbackEnv) -> bool:
"""Check, by name, if a given Dataset is the training data."""
# for lgb.cv() with eval_train_metric=True, evaluation is also done on the training set
# and those metrics are considered for early stopping
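
Note: in callback.py only the trailing parameter becomes keyword-only; parameters declared before the `*` can still be passed positionally. A small sketch of that mixed signature (the standalone function name here is hypothetical, not the library's API):

def gt_delta(curr_score: float, best_score: float, *, delta: float) -> bool:
    # curr_score and best_score stay positional; delta must be named at the call site
    return curr_score > best_score + delta

gt_delta(0.91, 0.90, delta=0.005)  # OK, returns True
# gt_delta(0.91, 0.90, 0.005)      # TypeError: takes 2 positional arguments but 3 were given
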
25 changes: 18 additions & 7 deletions python-package/lightgbm/dask.py
@@ -113,6 +113,7 @@ def _get_dask_client(client: Optional[Client]) -> Client:


def _assign_open_ports_to_workers(
*,
client: Client,
workers: List[str],
) -> Tuple[Dict[str, Future], Dict[str, int]]:
@@ -165,7 +166,11 @@ def _remove_list_padding(*args: Any) -> List[List[Any]]:
return [[z for z in arg if z is not None] for arg in args]


def _pad_eval_names(lgbm_model: LGBMModel, required_names: List[str]) -> LGBMModel:
def _pad_eval_names(
*,
lgbm_model: LGBMModel,
required_names: List[str],
) -> LGBMModel:
"""Append missing (key, value) pairs to a LightGBM model's evals_result_ and best_score_ OrderedDict attrs based on a set of required eval_set names.

Allows users to rely on expected eval_set names being present when fitting DaskLGBM estimators with ``eval_set``.
@@ -356,12 +361,12 @@ def _train_part(

if n_evals:
# ensure that expected keys for evals_result_ and best_score_ exist regardless of padding.
model = _pad_eval_names(model, required_names=evals_result_names)
model = _pad_eval_names(lgbm_model=model, required_names=evals_result_names)

return model if return_model else None


def _split_to_parts(data: _DaskCollection, is_matrix: bool) -> List[_DaskPart]:
def _split_to_parts(*, data: _DaskCollection, is_matrix: bool) -> List[_DaskPart]:
parts = data.to_delayed()
if isinstance(parts, np.ndarray):
if is_matrix:
@@ -372,7 +377,11 @@ def _split_to_parts(data: _DaskCollection, is_matrix: bool) -> List[_DaskPart]:
return parts


def _machines_to_worker_map(machines: str, worker_addresses: Iterable[str]) -> Dict[str, int]:
def _machines_to_worker_map(
*,
machines: str,
worker_addresses: Iterable[str],
) -> Dict[str, int]:
"""Create a worker_map from machines list.

Given ``machines`` and a list of Dask worker addresses, return a mapping where the keys are
@@ -773,7 +782,8 @@ def _train(
else:
_log_info("Finding random open ports for workers")
worker_to_socket_future, worker_address_to_port = _assign_open_ports_to_workers(
client, list(worker_map.keys())
client=client,
workers=list(worker_map.keys()),
)

machines = ",".join(
@@ -1091,20 +1101,21 @@ def _lgb_dask_fit(
)

self.set_params(**model.get_params()) # type: ignore[attr-defined]
self._lgb_dask_copy_extra_params(model, self) # type: ignore[attr-defined]
self._lgb_dask_copy_extra_params(source=model, dest=self) # type: ignore[attr-defined]

return self

def _lgb_dask_to_local(self, model_factory: Type[LGBMModel]) -> LGBMModel:
params = self.get_params() # type: ignore[attr-defined]
params.pop("client", None)
model = model_factory(**params)
self._lgb_dask_copy_extra_params(self, model)
self._lgb_dask_copy_extra_params(source=self, dest=model)
model._other_params.pop("client", None)
return model

@staticmethod
def _lgb_dask_copy_extra_params(
*,
source: Union["_DaskLGBMModel", LGBMModel],
dest: Union["_DaskLGBMModel", LGBMModel],
) -> None:
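
Note: the dask.py helpers gain the same guard; for two-argument helpers such as `_lgb_dask_copy_extra_params(source=..., dest=...)` it prevents silently swapping the source and destination objects, which share a type. A rough sketch of the idea (copy_extra_params and Model below are illustrations, not the LightGBM implementation):

def copy_extra_params(*, source, dest) -> None:
    # both objects must be named at the call site, so they cannot be swapped by accident
    for name, value in getattr(source, "_other_params", {}).items():
        setattr(dest, name, value)

class Model:
    def __init__(self):
        self._other_params = {"client": None, "verbose": -1}

src, dst = Model(), Model()
copy_extra_params(source=src, dest=dst)  # dst now carries src's extra params
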