view src/mapreducer.rs @ 37:bda56504a12c

Add ShardMergeIterator type. This type is used to merge the shards that are the input to the reduce phase. See https://drive.google.com/open?id=1grB87a0w9fQ2k7i04N3VJvYlw2BldxWNcHublW_ygJs
author Lewin Bormann <lbo@spheniscida.de>
date Mon, 01 Feb 2016 20:35:45 +0000
parents fcb3ee662ee7
children 85e3fb6bf1b8
line wrap: on
line source

//! The MapReducer trait and associated types.

use std::collections::LinkedList;
use std::clone::Clone;
use std::hash::{Hasher, SipHasher};

/// A (key,value) pair.
pub struct Record {
    pub key: String,
    pub value: String,
}

/// A (key,[value]) pair; typicall used as input to a reducer function.
/// Can be easily iterated over, e.g. in a `for` loop.
pub struct MultiRecord {
    key: String,
    value: Box<Iterator<Item = String>>,
}

impl MultiRecord {
    /// Retrieves the key of the record.
    pub fn key<'a>(&'a self) -> &'a String {
        &self.key
    }
}

impl IntoIterator for MultiRecord {
    type Item = String;
    type IntoIter = Box<Iterator<Item=String>>;
    /// Allows iterating over all the values.
    fn into_iter(self) -> Self::IntoIter {
        self.value
    }
}

/// Emitter type used in the mapper phase; used to emit (key,value) pairs.
pub struct MEmitter {
    r: LinkedList<Record>,
}

impl MEmitter {
    pub fn new() -> MEmitter {
        MEmitter { r: LinkedList::new() }
    }
    pub fn emit(&mut self, key: String, val: String) {
        self.r.push_back(Record {
            key: key,
            value: val,
        })
    }
    pub fn _get(self) -> LinkedList<Record> {
        self.r
    }
}

/// Emitter used in the reducer phase; used to emit values.
pub struct REmitter {
    r: LinkedList<String>,
}

impl REmitter {
    pub fn new() -> REmitter {
        REmitter { r: LinkedList::new() }
    }
    pub fn emit(&mut self, val: String) {
        self.r.push_back(val)
    }
    pub fn _get(self) -> LinkedList<String> {
        self.r
    }
}

/// Default sharding function.
pub fn _std_shard(n: usize, key: &String) -> usize {
    let mut h = SipHasher::new();
    h.write(key.as_bytes());
    h.finish() as usize % n
}

/// Map() function type. The MEmitter argument is used to emit values from
/// the map() function.
pub type MapperF = fn(&mut MEmitter, Record);
/// Reduce() function type. The REmitter argument is used to emit values
/// from the reduce() function.
pub type ReducerF = fn(&mut REmitter, MultiRecord);
/// A function used to determine the shard a key belongs in.
/// The first argument is the number of shards, the second one the key;
/// the return value should be in [0; n).
pub type SharderF = fn(usize, &String) -> usize;

/// A type implementing map() and reduce() functions.
/// The MapReducer is cloned once per mapper/reducer thread.
pub trait MapReducer: Clone {
    /// Takes one <key,value> pair and an emitter.
    /// The emitter is used to yield results from the map phase.
    fn map(&self, em: &mut MEmitter, record: Record);
    /// Takes one key and one or more values and emits one or more
    /// values.
    fn reduce(&self, em: &mut REmitter, records: MultiRecord);

    /// Determines how to map keys to (reduce) shards.
    /// Returns a number in [0; n) determining the shard the key belongs in.
    /// The default implementation uses a simple hash (SipHasher) and modulo.
    fn shard(&self, n: usize, key: &String) -> usize {
        _std_shard(n, key)
    }
}