Source code for flwr_datasets.partitioner.natural_id_partitioner
# Copyright 2023 Flower Labs GmbH. All Rights Reserved.## Licensed under the Apache License, Version 2.0 (the "License");# you may not use this file except in compliance with the License.# You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.# =============================================================================="""Natural id partitioner class that works with Hugging Face Datasets."""importnumpyasnpfromtqdmimporttqdmimportdatasetsfromflwr_datasets.common.typingimportNDArrayIntfromflwr_datasets.partitioner.partitionerimportPartitioner
class NaturalIdPartitioner(Partitioner):
    """Partitioner for a dataset that can be divided by a column with partition ids.

    Every unique value found in the `partition_by` column (a "natural id")
    defines one partition. Natural ids are mapped to consecutive integer
    partition ids in the order returned by `dataset.unique(partition_by)`.
    All mappings are built lazily, on the first call to `load_partition` or
    the first access of `num_partitions`.

    Parameters
    ----------
    partition_by: str
        The name of the column that contains the unique values of partitions.

    Examples
    --------
    "flwrlabs/shakespeare" dataset

    >>> from flwr_datasets import FederatedDataset
    >>> from flwr_datasets.partitioner import NaturalIdPartitioner
    >>>
    >>> partitioner = NaturalIdPartitioner(partition_by="character_id")
    >>> fds = FederatedDataset(dataset="flwrlabs/shakespeare",
    ...                        partitioners={"train": partitioner})
    >>> partition = fds.load_partition(0)

    "sentiment140" (aka Twitter) dataset

    >>> from flwr_datasets import FederatedDataset
    >>> from flwr_datasets.partitioner import NaturalIdPartitioner
    >>>
    >>> partitioner = NaturalIdPartitioner(partition_by="user")
    >>> fds = FederatedDataset(dataset="sentiment140",
    ...                        partitioners={"train": partitioner})
    >>> partition = fds.load_partition(0)
    """

    def __init__(
        self,
        partition_by: str,
    ) -> None:
        super().__init__()
        # All three mappings start empty and are filled on first use.
        self._partition_id_to_natural_id: dict[int, str] = {}
        self._natural_id_to_partition_id: dict[str, int] = {}
        self._partition_id_to_indices: dict[int, NDArrayInt] = {}
        self._partition_by = partition_by

    def _create_int_partition_id_to_natural_id(self) -> None:
        """Create a mapping from int indices to unique client ids from dataset.

        Natural ids come from the column specified in `partition_by`.
        """
        unique_natural_ids = self.dataset.unique(self._partition_by)
        self._partition_id_to_natural_id = dict(enumerate(unique_natural_ids))

    def _create_natural_id_to_int_partition_id(self) -> None:
        """Create a mapping from unique client ids from dataset to int indices.

        Natural ids come from the column specified in `partition_by`. This object is
        inverse of the `self._partition_id_to_natural_id`. This method assumes that
        `self._partition_id_to_natural_id` already exist.
        """
        self._natural_id_to_partition_id = {
            natural_id: partition_id
            for partition_id, natural_id in self._partition_id_to_natural_id.items()
        }

    def _ensure_id_mappings(self) -> None:
        """Build both partition-id <-> natural-id mappings if not built yet."""
        if not self._partition_id_to_natural_id:
            self._create_int_partition_id_to_natural_id()
            self._create_natural_id_to_int_partition_id()

    def _create_partition_id_to_indices(self) -> None:
        """Create a mapping from int partition ids to the row indices they own.

        Rows are grouped by their value in the `partition_by` column in a single
        pass. This method assumes that `self._natural_id_to_partition_id` already
        exist.
        """
        natural_id_to_indices = {}  # type: ignore
        natural_ids = np.array(self.dataset[self._partition_by])

        # `enumerate` hides the length from tqdm, so pass it explicitly to get a
        # real progress bar instead of a bare counter.
        for index, natural_id in tqdm(
            enumerate(natural_ids),
            total=len(natural_ids),
            desc="Generating partition_id_to_indices",
        ):
            natural_id_to_indices.setdefault(natural_id, []).append(index)

        self._partition_id_to_indices = {
            self._natural_id_to_partition_id[natural_id]: indices
            for natural_id, indices in natural_id_to_indices.items()
        }

    def load_partition(self, partition_id: int) -> datasets.Dataset:
        """Load a single partition corresponding to a single `partition_id`.

        The choice of the partition is based on unique integers assigned to each
        natural id present in the dataset in the `partition_by` column.

        Parameters
        ----------
        partition_id : int
            the index that corresponds to the requested partition

        Returns
        -------
        dataset_partition : Dataset
            single dataset partition

        Raises
        ------
        KeyError
            If `partition_id` does not correspond to any partition.
        """
        self._ensure_id_mappings()
        if not self._partition_id_to_indices:
            self._create_partition_id_to_indices()

        try:
            indices = self._partition_id_to_indices[partition_id]
        except KeyError:
            # Same exception type as the plain dict lookup, but with a message
            # that tells the caller which ids are actually available.
            raise KeyError(
                f"Unknown partition_id {partition_id!r}; valid partition ids are "
                f"integers in [0, {len(self._partition_id_to_indices)})."
            ) from None
        return self.dataset.select(indices)

    @property
    def num_partitions(self) -> int:
        """Total number of partitions."""
        self._ensure_id_mappings()
        return len(self._partition_id_to_natural_id)

    @property
    def partition_id_to_natural_id(self) -> dict[int, str]:
        """Node id to corresponding natural id present.

        Natural ids are the unique values in `partition_by` column in dataset.
        The mapping is empty until it has been built by `load_partition` or
        `num_partitions`.
        """
        return self._partition_id_to_natural_id

    @partition_id_to_natural_id.setter
    def partition_id_to_natural_id(self, value: dict[int, str]) -> None:
        raise AttributeError(
            "Setting the partition_id_to_natural_id dictionary is not allowed."
        )