non-working optimized lcs implementation

2023-06-08 22:58:08 -04:00 · 2023-06-08 22:58:08 -04:00 · 7cc7d3bb76
parent d793297ad5
commit 7cc7d3bb76
2 changed files with 253 additions and 52 deletions
--- a/src/diff.rs
+++ b/src/diff.rs
@ -0,0 +1,195 @@
+// Based on https://github.com/mathertel/Diff
+// "An O(ND) Difference Algorithm and its Variations" by Eugene Myers Algorithmica Vol. 1 No. 2, 1986, p 251.
+
+use std::collections::HashMap;
+
+struct DiffData { // Line-coded form of one input text plus its per-line change flags.
+    length: usize, // Number of line codes; `diff_data` sets it to `codes.len()`.
+    codes: Vec<usize>, // One numeric code per line, as produced by `diff_codes`.
+    modified: Vec<bool>, // Change flags written by `lcs`; `diff_data` allocates `length + 2` entries, all false.
+}
+
+/// Runs the line-based comparison of `a` against `b`.
+///
+/// Both texts are converted to line codes through one shared table, so equal
+/// lines in either text carry the same code. `lcs` then flags the lines that
+/// differ, and the flagged data is handed to `optimize` and `create_diffs`.
+pub fn diff(a: &str, b: &str) {
+    let mut line_codes: HashMap<&str, usize> = HashMap::new();
+    let mut left = diff_data(a, &mut line_codes);
+    let mut right = diff_data(b, &mut line_codes);
+
+    let (len_left, len_right) = (left.length, right.length);
+
+    // Scratch space for the forward and reverse searches, indexed by diagonal.
+    let scratch_len = 2 * (len_left + len_right) + 2;
+    let mut forward = vec![0usize; scratch_len];
+    let mut reverse = vec![0usize; scratch_len];
+
+    lcs(&mut left, 0, len_left, &mut right, 0, len_right, &mut forward, &mut reverse);
+
+    optimize(&left);
+    optimize(&right);
+
+    create_diffs(&left, &right)
+}
+
+/// Builds the `DiffData` for one text.
+///
+/// The line codes come from `diff_codes`, which also records any new line in
+/// `existing_hashes`. The flag vector starts out all `false` and holds two
+/// entries more than there are lines.
+fn diff_data<'a>(text: &'a str, existing_hashes: &mut HashMap<&'a str, usize>) -> DiffData {
+    let codes = diff_codes(text, existing_hashes);
+    let length = codes.len();
+    let modified = vec![false; length + 2];
+
+    DiffData { length, codes, modified }
+}
+
+/// Translates every line of `text` into a numeric code.
+///
+/// `text` is split on `'\n'` (so an empty text yields one empty line, and a
+/// trailing newline yields a final empty line). A line already present in
+/// `existing_hashes` reuses its code; an unseen line is inserted with the next
+/// free code, which is always `existing_hashes.len() + 1` (codes start at 1).
+///
+/// Returns the codes in line order.
+fn diff_codes<'a>(text: &'a str, existing_hashes: &mut HashMap<&'a str, usize>) -> Vec<usize> {
+    text.split('\n')
+        .map(|line| {
+            // Computed up front because `entry` holds the map borrowed; it is
+            // only stored when the line is new.
+            let next_code = existing_hashes.len() + 1;
+            *existing_hashes.entry(line).or_insert(next_code)
+        })
+        .collect()
+}
+
+// Longest Common-Subsequence
+//
+// Divide-and-conquer comparison of `data_a.codes[lower_a..upper_a]` with
+// `data_b.codes[lower_b..upper_b]` (upper bounds exclusive). Equal lines at the
+// start and end of the ranges are skipped. If one range is then empty, every
+// line left in the other range is flagged in its `modified` vector at the
+// line's own 0-based index. Otherwise the ranges are split at the point found
+// by `sms` and both halves are processed recursively.
+//
+// `down_vector` and `up_vector` are scratch space shared by all recursion
+// levels; `sms` grows them if they are too small.
+fn lcs(data_a: &mut DiffData, mut lower_a: usize, mut upper_a: usize, data_b: &mut DiffData, mut lower_b: usize, mut upper_b: usize, down_vector: &mut Vec<usize>, up_vector: &mut Vec<usize>) {
+    // Skip the common prefix.
+    while lower_a < upper_a && lower_b < upper_b && data_a.codes[lower_a] == data_b.codes[lower_b] {
+        lower_a += 1;
+        lower_b += 1;
+    }
+
+    // Skip the common suffix.
+    while lower_a < upper_a && lower_b < upper_b && data_a.codes[upper_a - 1] == data_b.codes[upper_b - 1] {
+        upper_a -= 1;
+        upper_b -= 1;
+    }
+
+    if lower_a == upper_a {
+        // Inserted lines: nothing left in A, so the rest of B is new.
+        data_b.modified[lower_b..upper_b].fill(true);
+    } else if lower_b == upper_b {
+        // Deleted lines: nothing left in B, so the rest of A is gone.
+        data_a.modified[lower_a..upper_a].fill(true);
+    } else {
+        // Find the middle snake of an optimal path for the two ranges.
+        let (mid_a, mid_b) = sms(data_a, lower_a, upper_a, data_b, lower_b, upper_b, down_vector, up_vector);
+
+        // The path runs from (lower_a, lower_b) to (mid_a, mid_b) and from
+        // (mid_a, mid_b) to (upper_a, upper_b).
+        lcs(data_a, lower_a, mid_a, data_b, lower_b, mid_b, down_vector, up_vector);
+        lcs(data_a, mid_a, upper_a, data_b, mid_b, upper_b, down_vector, up_vector);
+    }
+}
+
+// Reads a signed endpoint from a scratch vector (see `write_endpoint`).
+fn read_endpoint(vector: &[usize], index: isize) -> isize {
+    vector[index as usize] as isize
+}
+
+// Stores a signed endpoint in a scratch vector.
+//
+// Reverse-path endpoints can drop below zero, but the vectors hold `usize`, so
+// the value is stored as its two's-complement bit pattern; `read_endpoint`
+// casts it back.
+fn write_endpoint(vector: &mut [usize], index: isize, value: isize) {
+    vector[index as usize] = value as usize;
+}
+
+// Shortest Middle Snake
+//
+// Searches forward from (lower_a, lower_b) and backward from (upper_a, upper_b)
+// at the same time, one edit distance `d` at a time, until the two searches
+// overlap on a diagonal `k = x - y`. Returns the overlap point as
+// (index into A, index into B).
+//
+// Callers must pass ranges that are both non-empty. Panics if no overlap is
+// found, which would mean the search itself is broken.
+fn sms(data_a: &DiffData, lower_a: usize, upper_a: usize, data_b: &DiffData, lower_b: usize, upper_b: usize, down_vector: &mut Vec<usize>, up_vector: &mut Vec<usize>) -> (usize, usize) {
+    // Diagonals and reverse endpoints can be negative, so all search
+    // arithmetic is signed. Line counts are far below `isize::MAX`.
+    let (lower_a, upper_a) = (lower_a as isize, upper_a as isize);
+    let (lower_b, upper_b) = (lower_b as isize, upper_b as isize);
+
+    let total = data_a.length + data_b.length + 1;
+    let max = total as isize;
+
+    // Make sure every index `max + k +/- 1` used below exists.
+    let needed = 2 * total + 2;
+    if down_vector.len() < needed {
+        down_vector.resize(needed, 0);
+    }
+    if up_vector.len() < needed {
+        up_vector.resize(needed, 0);
+    }
+
+    // Diagonals on which the forward and the reverse search start.
+    let down_k = lower_a - lower_b;
+    let up_k = upper_a - upper_b;
+
+    let delta = (upper_a - lower_a) - (upper_b - lower_b);
+    let odd_delta = (delta & 1) != 0;
+
+    // The vectors are 0-based; these offsets map a diagonal to a slot.
+    let down_offset = max - down_k;
+    let up_offset = max - up_k;
+
+    let max_d = ((upper_a - lower_a + upper_b - lower_b) / 2) + 1;
+
+    write_endpoint(down_vector, down_offset + down_k + 1, lower_a);
+    write_endpoint(up_vector, up_offset + up_k - 1, upper_a);
+
+    for d in 0..=max_d {
+        // Extend the forward path
+        for k in ((down_k - d)..=(down_k + d)).step_by(2) {
+            let mut x = if k == down_k - d {
+                // Down
+                read_endpoint(down_vector, down_offset + k + 1)
+            } else {
+                // Right, unless coming down from diagonal k + 1 reaches at least as far
+                let right = read_endpoint(down_vector, down_offset + k - 1) + 1;
+                if k < down_k + d {
+                    right.max(read_endpoint(down_vector, down_offset + k + 1))
+                } else {
+                    right
+                }
+            };
+            let mut y = x - k;
+
+            // Find the end of the furthest reaching forward D-path in diagonal k.
+            while x < upper_a && y < upper_b && data_a.codes[x as usize] == data_b.codes[y as usize] {
+                x += 1;
+                y += 1;
+            }
+
+            write_endpoint(down_vector, down_offset + k, x);
+
+            // Overlap ?
+            if odd_delta && up_k - d < k && k < up_k + d && read_endpoint(up_vector, up_offset + k) <= x {
+                debug_assert!(x >= 0 && y >= 0, "middle snake must lie inside the compared ranges");
+                return (x as usize, y as usize);
+            }
+        }
+
+        // Extend the reverse path
+        for k in ((up_k - d)..=(up_k + d)).step_by(2) {
+            let mut x = if k == up_k + d {
+                // Up
+                read_endpoint(up_vector, up_offset + k - 1)
+            } else {
+                // Left, unless coming up from diagonal k - 1 reaches further back
+                let left = read_endpoint(up_vector, up_offset + k + 1) - 1;
+                if k > up_k - d {
+                    left.min(read_endpoint(up_vector, up_offset + k - 1))
+                } else {
+                    left
+                }
+            };
+            let mut y = x - k;
+
+            // Find the end of the furthest reaching reverse D-path in diagonal k.
+            while x > lower_a && y > lower_b && data_a.codes[(x - 1) as usize] == data_b.codes[(y - 1) as usize] {
+                x -= 1;
+                y -= 1;
+            }
+
+            write_endpoint(up_vector, up_offset + k, x);
+
+            // Overlap ?
+            if !odd_delta && down_k - d <= k && k <= down_k + d {
+                let forward_x = read_endpoint(down_vector, down_offset + k);
+                if x <= forward_x {
+                    let forward_y = forward_x - k;
+                    debug_assert!(forward_x >= 0 && forward_y >= 0, "middle snake must lie inside the compared ranges");
+                    return (forward_x as usize, forward_y as usize);
+                }
+            }
+        }
+    }
+
+    panic!("This should not be possible :(");
+}
+
+fn optimize(data: &DiffData) {} // Stub: the body is empty, so `data` is currently left untouched.
+
+fn create_diffs(data_a: &DiffData, data_b: &DiffData) {} // Stub: the body is empty; nothing is built or returned from the flagged data yet.
--- a/src/main.rs
+++ b/src/main.rs
@ -1,60 +1,66 @@
-use crate::lcs::diff;
+use crate::diff::diff;

 mod matrix;
 mod lcs;
+mod diff;

 fn main() {
-    let a = "abcd";
-    let b = "abce";
+    let a = "abcabba\nlkajsdfasdf\nasdfasdfasdf\nlasjkdf";
+    let b = "abcabba\ncbabasdfasdf\nlasjkdf";

-    // diff(a, b);
-    lcs(a, b);
+    diff(a, b);
+    // lcs(a, b);
 }

-fn lcs(a: &str, b: &str) {
-    let n = a.len();
-    let m = b.len();
-    let max = (n + m) / 2;
-    let mut v = vec![0usize; max * 2];
-
-    for d in 0..max {
-        let mut k = 0usize;
-        while k <= d * 2 {
-            let mut x = if k == 0 || k != d * 2 && v[k - 1] < v[k + 1] {
-                v[k + 1]
-            } else {
-                v[k - 1] + 1
-            };
-
-            let mut y = if k < x {
-                x - k
-            } else {
-                0
-            };
-
-            while x < n && y < m {
-                let ac = a.chars().nth(x + 1).unwrap();
-                let bc = b.chars().nth(y + 1).unwrap();
-
-                if ac != bc {
-                    break;
-                }
-
-                x = x + 1;
-                y = y + 1;
-            }
-
-            v[k] = x;
-            if x >= n && y >= m {
-                println!("Length of a SES is D ({d})");
-                dbg!(v);
-                return;
-            }
-
-            k += 2;
-        }
-    }
-
-    println!("Length of a SES is greater than MAX ({max})");
-    dbg!(v);
-}
+// fn lcs(a: &str, b: &str) {
+//     let n = a.len() as i32;
+//     let m = b.len() as i32;
+//     let max = n + m;
+//     let mut endpoints = vec![0i32; max as usize * 2];
+//
+//     for script_length in 0..max {
+//         let mut k = -script_length;
+//         while k <= script_length * 2 {
+//             let index = (k + max) as usize + 1;
+//             let previous_endpoint = endpoints[index - 1];
+//             let next_endpoint = endpoints[index + 1];
+//
+//             let mut x = if k == -script_length || k != script_length && previous_endpoint < next_endpoint {
+//                 next_endpoint
+//             } else {
+//                 previous_endpoint + 1
+//             };
+//
+//             let mut y = if k < x {
+//                 x - k
+//             } else {
+//                 0
+//             };
+//
+//             // Increase x and y as long as we are in a common sequence between a and b
+//             while x < n && y < m {
+//                 let ac = a.chars().nth(x as usize).unwrap();
+//                 let bc = b.chars().nth(y as usize).unwrap();
+//
+//                 if ac != bc {
+//                     break;
+//                 }
+//
+//                 x += 1;
+//                 y += 1;
+//             }
+//
+//             endpoints[index] = x;
+//
+//             // We have traveled through both strings, the length of the shortest edit script (SES) has been found.
+//             if x >= n && y >= m {
+//                 println!("Length of a SES is D ({d})");
+//                 return;
+//             }
+//
+//             k += 2;
+//         }
+//     }
+//
+//     println!("Length of a SES is greater than MAX ({max})");
+// }