Mercurial > lbo > hg > localmr
changeset 48:e84f5abf7dc5
Add sane string ordering function
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Wed, 03 Feb 2016 20:04:25 +0000 |
parents | d792f0788069 |
children | 8a2538cbac8a |
files | src/shard_merge.rs |
diffstat | 1 files changed, 128 insertions(+), 11 deletions(-) [+] |
line wrap: on
line diff
--- a/src/shard_merge.rs Tue Feb 02 21:22:40 2016 +0000 +++ b/src/shard_merge.rs Wed Feb 03 20:04:25 2016 +0000 @@ -2,21 +2,29 @@ //! https://drive.google.com/open?id=1grB87a0w9fQ2k7i04N3VJvYlw2BldxWNcHublW_ygJs. //! Genericized in order to build arbitrary merge trees. -use std::cmp::PartialOrd; +#![allow(dead_code)] + + +use std::cmp::{Ord, Ordering}; use std::iter; +/// Function type to be used as custom compare function +/// (rust's standard String comparison is based on ASCII values, not dictionary order) +pub type Comparer<T> = fn(a: &T, b: &T) -> Ordering; + /// See module description. /// This type uses dynamic instead of static dispatch because it realizes an arbitrary structure /// and can therefore not work with a single type signature. -pub struct ShardMergeIterator<'a, T: PartialOrd> { +pub struct ShardMergeIterator<'a, T: Ord> { left: Box<Iterator<Item = T> + 'a>, right: Box<Iterator<Item = T> + 'a>, left_peeked: Option<T>, right_peeked: Option<T>, + comparer: Option<Comparer<T>>, } -impl<'a, T: PartialOrd + Clone> Iterator for ShardMergeIterator<'a, T> { +impl<'a, T: Ord + Clone> Iterator for ShardMergeIterator<'a, T> { type Item = T; fn next(&mut self) -> Option<Self::Item> { // fill up @@ -41,20 +49,21 @@ self.right_peeked = None; return r; } - (l @ Some(_), r @ Some(_)) => { - if l <= r { + (Some(l), Some(r)) => { + let cmp = self.compare(&l, &r); + if cmp == Ordering::Less || cmp == Ordering::Equal { self.left_peeked = None; - return l; + return Some(l); } else { self.right_peeked = None; - return r; + return Some(r); } } } } } -impl<'a, T: PartialOrd + Clone> ShardMergeIterator<'a, T> { +impl<'a, T: Ord + Clone> ShardMergeIterator<'a, T> { fn default() -> ShardMergeIterator<'a, T> where T: 'a { @@ -63,6 +72,7 @@ right: Box::new(iter::empty()), left_peeked: None, right_peeked: None, + comparer: None, } } /// Takes multiple iterators of type It and generates one ShardedMergeIterator.. @@ -103,6 +113,17 @@ ShardMergeIterator::merge(merged) } + pub fn set_comparer(&mut self, cmp: Comparer<T>) { + self.comparer = Some(cmp); + } + + fn compare(&self, a: &T, b: &T) -> Ordering { + match self.comparer { + None => a.cmp(b), + Some(f) => f(a, b) + } + } + /// Merge multiple ShardMergeIterators, recursively (meaning it will result in a more or less /// balanced merge sort tree). fn merge(mut its: Vec<ShardMergeIterator<'a, T>>) -> ShardMergeIterator<'a, T> @@ -133,6 +154,46 @@ } } +#[inline] +fn sane_char_compare(a: char, b: char) -> Ordering { + use std::ascii::AsciiExt; + // denormalize case to lower case + let cmp = a.to_ascii_lowercase().cmp(&b.to_ascii_lowercase()); + + // Handle the case of a and b being the same letter but with different case. + // Native: 'B' < 'a'; we want: 'B' > 'a' + if cmp == Ordering::Equal { + match a.cmp(&b) { + Ordering::Equal => Ordering::Equal, // actually same character + Ordering::Less => Ordering::Greater, // 'B' > 'a'! + Ordering::Greater => Ordering::Less, // 'a' < 'B'! + } + } else { + cmp + } +} + +/// Compares a with b in dictionary order (case insensitive) +#[inline] +fn sane_string_compare(a: &String, b: &String) -> Ordering { + let (mut charsa, mut charsb) = (a.chars(), b.chars()); + loop { + match (charsa.next(), charsb.next()) { + (None, None) => return Ordering::Equal, + (_, None) => return Ordering::Greater, + (None, _) => return Ordering::Less, + (Some(ca), Some(cb)) => { + let cmp = sane_char_compare(ca, cb); + if cmp != Ordering::Equal { + return cmp; + } else { + continue + } + } + } + } +} + #[cfg(test)] mod tests { use std::vec; @@ -185,8 +246,8 @@ use std::fmt; use std::io::Write; - use std::cmp::Ord; - + // Slow test! + //#[test] fn test_merge_large_files() { let mut files = Vec::with_capacity(11); @@ -200,7 +261,63 @@ let mut outfile = lines::LinesWriter::new_to_file(&String::from("testdata/all_sorted.txt")).unwrap(); for line in merge_it { - outfile.write(line.as_bytes()); + let _ = outfile.write(line.as_bytes()); + } + } + + use std::cmp::Ordering; + use super::{sane_char_compare, sane_string_compare}; + + #[test] + fn test_sane_char_compare() { + assert_eq!(sane_char_compare('a', 'B'), Ordering::Less); + assert_eq!(sane_char_compare('a', 'a'), Ordering::Equal); + assert_eq!(sane_char_compare('A', 'a'), Ordering::Greater); + assert_eq!(sane_char_compare('B', 'a'), Ordering::Greater); + } + + #[test] + fn test_sane_string_compare() { + let cnv = String::from; + let s1 = &cnv(""); + let s2 = &cnv("0abc"); + let s3 = &cnv("123"); + let s4 = &cnv("abc"); + let s5 = &cnv("Abc"); + let s6 = &cnv("ABC"); + let s7 = &cnv("aBC"); + + assert_eq!(sane_string_compare(s1, s2), Ordering::Less); + assert_eq!(sane_string_compare(s2, s3), Ordering::Less); + assert_eq!(sane_string_compare(s3, s2), Ordering::Greater); + assert_eq!(sane_string_compare(s2, s2), Ordering::Equal); + assert_eq!(sane_string_compare(s4, s5), Ordering::Less); + assert_eq!(sane_string_compare(s5, s6), Ordering::Less); + assert_eq!(sane_string_compare(s6, s7), Ordering::Greater); + } + + #[inline] + fn bogus_fn(o: Ordering) -> bool { + if o == Ordering::Equal { + panic!("bogus panic") + } + true + } + + // Slow test! + //#[test] + fn bench_sane_string_compare() { + let cnv = String::from; + let s1 = &cnv(""); + let s2 = &cnv("0abc"); + let s3 = &cnv("123"); + let s4 = &cnv("abc"); + let s5 = &cnv("Abc"); + + for _ in 0..50000000 { + bogus_fn(sane_string_compare(s1, s2)); + bogus_fn(sane_string_compare(s4, s3)); + bogus_fn(sane_string_compare(s4, s5)); } } }