changeset 49:8a2538cbac8a

Turns out, you have to use the same sorting function everywhere for the sort to work.
author Lewin Bormann <lbo@spheniscida.de>
date Wed, 03 Feb 2016 20:50:34 +0000
parents e84f5abf7dc5
children 603c38d64cd5 a0647ae635e2 6a51cc42d1a6
files src/shard_merge.rs
diffstat 1 files changed, 53 insertions(+), 14 deletions(-) [+]
line wrap: on
line diff
--- a/src/shard_merge.rs	Wed Feb 03 20:04:25 2016 +0000
+++ b/src/shard_merge.rs	Wed Feb 03 20:50:34 2016 +0000
@@ -75,9 +75,20 @@
             comparer: None,
         }
     }
+
+    pub fn build<It: Iterator<Item = T>, ItIt: Iterator<Item = It>>(sources: &mut ItIt) -> ShardMergeIterator<'a, T>
+        where T: 'a, It: 'a {
+            ShardMergeIterator::_build(sources, None)
+    }
+    
+    pub fn build_with_cmp<It: Iterator<Item = T>, ItIt: Iterator<Item = It>>(sources: &mut ItIt, cmp: Comparer<T>) -> ShardMergeIterator<'a, T>
+        where T: 'a, It: 'a {
+            ShardMergeIterator::_build(sources, Some(cmp))
+    }
+
     /// Takes multiple iterators of type It and generates one ShardedMergeIterator..
     /// (yes, iterator over a collection of iterators).
-    pub fn build<It: Iterator<Item = T>, ItIt: Iterator<Item = It>>(sources: &mut ItIt)
+    fn _build<It: Iterator<Item = T>, ItIt: Iterator<Item = It>>(sources: &mut ItIt, cmp: Option<Comparer<T>>)
                                                                     -> ShardMergeIterator<'a, T>
         where T: 'a,
               It: 'a
@@ -96,6 +107,7 @@
                     merged.push(ShardMergeIterator {
                         left: Box::new(src1),
                         right: Box::new(iter::empty()),
+                        comparer: cmp,
                         ..ShardMergeIterator::default()
                     })
                 }
@@ -103,6 +115,7 @@
                     merged.push(ShardMergeIterator {
                         left: Box::new(src1),
                         right: Box::new(src),
+                        comparer: cmp,
                         ..ShardMergeIterator::default()
                     })
                 }
@@ -110,11 +123,7 @@
         }
 
         // Recursively build the merge tree from the leaves.
-        ShardMergeIterator::merge(merged)
-    }
-
-    pub fn set_comparer(&mut self, cmp: Comparer<T>) {
-        self.comparer = Some(cmp);
+        ShardMergeIterator::merge(merged, cmp)
     }
 
     fn compare(&self, a: &T, b: &T) -> Ordering {
@@ -126,19 +135,20 @@
 
     /// Merge multiple ShardMergeIterators, recursively (meaning it will result in a more or less
     /// balanced merge sort tree).
-    fn merge(mut its: Vec<ShardMergeIterator<'a, T>>) -> ShardMergeIterator<'a, T>
+    fn merge(mut its: Vec<ShardMergeIterator<'a, T>>, cmp: Option<Comparer<T>>) -> ShardMergeIterator<'a, T>
         where T: 'a
     {
         if its.len() == 0 {
             ShardMergeIterator::default()
         } else if its.len() == 1 {
-            ShardMergeIterator { left: Box::new(its.remove(0)), ..ShardMergeIterator::default() }
+            ShardMergeIterator { left: Box::new(its.remove(0)), comparer: cmp, ..ShardMergeIterator::default() }
         } else if its.len() == 2 {
             let it1 = its.remove(0);
             let it2 = its.remove(0);
             ShardMergeIterator {
                 left: Box::new(it1),
                 right: Box::new(it2),
+                comparer: cmp,
                 ..ShardMergeIterator::default()
             }
         } else {
@@ -146,8 +156,9 @@
             let split_at = its.len() / 2;
             let right = its.split_off(split_at);
             ShardMergeIterator {
-                left: Box::new(ShardMergeIterator::merge(its)),
-                right: Box::new(ShardMergeIterator::merge(right)),
+                left: Box::new(ShardMergeIterator::merge(its, cmp)),
+                right: Box::new(ShardMergeIterator::merge(right, cmp)),
+                comparer: cmp,
                 ..ShardMergeIterator::default()
             }
         }
@@ -173,6 +184,35 @@
     }
 }
 
+#[inline]
+fn dict_char_compare(a: char, b: char) -> Ordering {
+    use std::ascii::AsciiExt;
+    // denormalize case to lower case
+    a.to_ascii_lowercase().cmp(&b.to_ascii_lowercase())
+}
+
+/// Compares a with b in a totally case insensitive manner
+/// (like coreutil sort)
+#[inline]
+fn dict_string_compare(a: &String, b: &String) -> Ordering {
+    let (mut charsa, mut charsb) = (a.chars(), b.chars());
+    loop {
+        match (charsa.next(), charsb.next()) {
+            (None, None) => return Ordering::Equal,
+            (_, None) => return Ordering::Greater,
+            (None, _) => return Ordering::Less,
+            (Some(ca), Some(cb)) => {
+                let cmp = dict_char_compare(ca, cb);
+                if cmp != Ordering::Equal {
+                    return cmp;
+                } else {
+                    continue
+                }
+            }
+        }
+    }
+}
+
 /// Compares a with b in dictionary order (case insensitive)
 #[inline]
 fn sane_string_compare(a: &String, b: &String) -> Ordering {
@@ -247,7 +287,7 @@
     use std::io::Write;
 
     // Slow test!
-    //#[test]
+    #[test]
     fn test_merge_large_files() {
         let mut files = Vec::with_capacity(11);
 
@@ -256,8 +296,7 @@
             files.push(lines::new_from_file(&name).unwrap());
         }
 
-        let merge_it = ShardMergeIterator::build(&mut files.into_iter());
-
+        let merge_it = ShardMergeIterator::build_with_cmp(&mut files.into_iter(), dict_string_compare);
         let mut outfile = lines::LinesWriter::new_to_file(&String::from("testdata/all_sorted.txt")).unwrap();
 
         for line in merge_it {
@@ -266,7 +305,7 @@
     }
 
     use std::cmp::Ordering;
-    use super::{sane_char_compare, sane_string_compare};
+    use super::{sane_char_compare, sane_string_compare, dict_string_compare};
 
     #[test]
     fn test_sane_char_compare() {