lib/strings: Add levenshtein distance functions

Adds some functions related to string similarity: - lib.strings.commonPrefixLength - lib.strings.commonSuffixLength - lib.strings.levenshtein - lib.strings.levenshteinAtMost
2020-07-25 19:53:15 +02:00 · 2020-07-25 19:53:15 +02:00 · 975a1ae339
commit 975a1ae339
parent a40f967c85
1 changed files with 127 additions and 0 deletions
--- a/lib/strings.nix
+++ b/lib/strings.nix
@ -774,4 +774,131 @@ rec {
    (x: if stringLength x == 0 then "unknown" else x)
  ];

+  /* Computes the Levenshtein distance between two strings.
+     Complexity O(n*m) where n and m are the lengths of the strings.
+     Algorithm adjusted from https://stackoverflow.com/a/9750974/6605742
+
+     Type: levenshtein :: string -> string -> int
+
+     Example:
+       levenshtein "foo" "foo"
+       => 0
+       levenshtein "book" "hook"
+       => 1
+       levenshtein "hello" "Heyo"
+       => 3
+  */
+  levenshtein = a: b: let
+    # Two dimensional array with dimensions (stringLength a + 1, stringLength b + 1)
+    arr = lib.genList (i:
+      lib.genList (j:
+        dist i j
+      ) (stringLength b + 1)
+    ) (stringLength a + 1);
+    d = x: y: lib.elemAt (lib.elemAt arr x) y;
+    dist = i: j:
+      let c = if substring (i - 1) 1 a == substring (j - 1) 1 b
+        then 0 else 1;
+      in
+      if j == 0 then i
+      else if i == 0 then j
+      else lib.min
+        ( lib.min (d (i - 1) j + 1) (d i (j - 1) + 1))
+        ( d (i - 1) (j - 1) + c );
+  in d (stringLength a) (stringLength b);
+
+  /* Returns the length of the prefix common to both strings.
+  */
+  commonPrefixLength = a: b:
+    let
+      m = lib.min (stringLength a) (stringLength b);
+      go = i: if i >= m then m else if substring i 1 a == substring i 1 b then go (i + 1) else i;
+    in go 0;
+
+  /* Returns the length of the suffix common to both strings.
+  */
+  commonSuffixLength = a: b:
+    let
+      m = lib.min (stringLength a) (stringLength b);
+      go = i: if i >= m then m else if substring (stringLength a - i - 1) 1 a == substring (stringLength b - i - 1) 1 b then go (i + 1) else i;
+    in go 0;
+
+  /* Returns whether the levenshtein distance between two strings is at most some value
+     Complexity is O(min(n,m)) for k <= 2 and O(n*m) otherwise
+
+     Type: levenshteinAtMost :: int -> string -> string -> bool
+
+     Example:
+       levenshteinAtMost 0 "foo" "foo"
+       => true
+       levenshteinAtMost 1 "foo" "boa"
+       => false
+       levenshteinAtMost 2 "foo" "boa"
+       => true
+       levenshteinAtMost 2 "This is a sentence" "this is a sentense."
+       => false
+       levenshteinAtMost 3 "This is a sentence" "this is a sentense."
+       => true
+
+  */
+  levenshteinAtMost = let
+    infixDifferAtMost1 = x: y: stringLength x <= 1 && stringLength y <= 1;
+
+    # This function takes two strings stripped by their common pre and suffix,
+    # and returns whether they differ by at most two by Levenshtein distance.
+    # Because of this stripping, if they do indeed differ by at most two edits,
+    # we know that those edits were (if at all) done at the start or the end,
+    # while the middle has to have stayed the same. This fact is used in the
+    # implementation.
+    infixDifferAtMost2 = x: y:
+      let
+        xlen = stringLength x;
+        ylen = stringLength y;
+        # This function is only called with |x| >= |y| and |x| - |y| <= 2, so
+        # diff is one of 0, 1 or 2
+        diff = xlen - ylen;
+
+        # Infix of x and y, stripped by the left and right most character
+        xinfix = substring 1 (xlen - 2) x;
+        yinfix = substring 1 (ylen - 2) y;
+
+        # x and y but a character deleted at the left or right
+        xdelr = substring 0 (xlen - 1) x;
+        xdell = substring 1 (xlen - 1) x;
+        ydelr = substring 0 (ylen - 1) y;
+        ydell = substring 1 (ylen - 1) y;
+      in
+        # A length difference of 2 can only be gotten with 2 delete edits,
+        # which have to have happened at the start and end of x
+        # Example: "abcdef" -> "bcde"
+        if diff == 2 then xinfix == y
+        # A length difference of 1 can only be gotten with a deletion on the
+        # right and a replacement on the left or vice versa.
+        # Example: "abcdef" -> "bcdez" or "zbcde"
+        else if diff == 1 then xinfix == ydelr || xinfix == ydell
+        # No length difference can either happen through replacements on both
+        # sides, or a deletion on the left and an insertion on the right or
+        # vice versa
+        # Example: "abcdef" -> "zbcdez" or "bcdefz" or "zabcde"
+        else xinfix == yinfix || xdelr == ydell || xdell == ydelr;
+
+    in k: if k <= 0 then a: b: a == b else
+      let f = a: b:
+        let
+          alen = stringLength a;
+          blen = stringLength b;
+          prelen = commonPrefixLength a b;
+          suflen = commonSuffixLength a b;
+          presuflen = prelen + suflen;
+          ainfix = substring prelen (alen - presuflen) a;
+          binfix = substring prelen (blen - presuflen) b;
+        in
+        # Make a be the bigger string
+        if alen < blen then f b a
+        # If a has over k more characters than b, even with k deletes on a, b can't be reached
+        else if alen - blen > k then false
+        else if k == 1 then infixDifferAtMost1 ainfix binfix
+        else if k == 2 then infixDifferAtMost2 ainfix binfix
+        else levenshtein ainfix binfix <= k;
+      in f;
 }