Mercurial > lbo > hg > localmr
changeset 63:10e3e2efbc57
Add reduce group size parameter
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Sat, 06 Feb 2016 00:07:26 +0000 |
parents | 1608d4061077 |
children | 33f3fcc22c02 |
files | src/parameters.rs |
diffstat | 1 files changed, 19 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/src/parameters.rs Sat Feb 06 00:07:01 2016 +0000 +++ b/src/parameters.rs Sat Feb 06 00:07:26 2016 +0000 @@ -10,6 +10,8 @@ pub map_partition_size: usize, + pub reduce_group_size: usize, + // Internal parameters pub shard_id: usize, } @@ -21,6 +23,7 @@ mappers: 4, reducers: 4, map_partition_size: 100 * 1024 * 1024, + reduce_group_size: 1, shard_id: 0, } } @@ -28,6 +31,8 @@ /// An implementation detail: When processing the data during the map phase, this /// parameter determines how many keys are processed in direct sequence. Heavily increasing /// this value increases memory usage. + /// + /// Default 256 pub fn set_key_buffer_size(mut self, n: usize) -> MRParameters { self.key_buffer_size = n; self @@ -36,6 +41,8 @@ /// Determines how many parallel processes will be run. Mappers and reducers do in general /// not run at the same time (as the reducers need to wait for the map output). The number of /// reducers also determines the sharding of the map output data. + /// + /// Default 4/4 pub fn set_concurrency(mut self, mappers: usize, reducers: usize) -> MRParameters { self.mappers = mappers; self.reducers = reducers; @@ -48,12 +55,24 @@ /// In general: All input data of one chunk will be in memory; all output data will be in /// memory, too; but both are not in memory at the full size at the same time (as input data /// are consumed the output data builds up, and the memory taken up by the former is released). + /// + /// Default 100 MiB pub fn set_partition_size(mut self, size: usize) -> MRParameters { self.map_partition_size = size; self } + /// How big are the groups of keys in the reduce phase expected to be? (used for pre-allocating + /// buffers) + /// + /// Default 1. + pub fn set_reduce_group_size(mut self, size: usize) -> MRParameters { + self.reduce_group_size = size; + self + } + /// For internal use: Sets the ID of the executing data chunk (for file naming etc.) + /// pub fn set_shard_id(mut self, n: usize) -> MRParameters { self.shard_id = n; self