Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text.similarity;
018
019import java.util.Arrays;
020
021/**
022 * An algorithm for measuring the difference between two character sequences.
023 *
024 * <p>
025 * This is the number of changes needed to change one sequence into another,
026 * where each change is a single character modification (deletion, insertion
027 * or substitution).
028 * </p>
029 *
030 * @since 1.0
031 */
032public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResults> {
033
034    /**
035     * Default instance.
036     */
037    private static final LevenshteinDetailedDistance DEFAULT_INSTANCE = new LevenshteinDetailedDistance();
038
039    /**
040     * Finds count for each of the three [insert, delete, substitute] operations
041     * needed. This is based on the matrix formed based on the two character
042     * sequence.
043     *
044     * @param left character sequence which need to be converted from
045     * @param right character sequence which need to be converted to
046     * @param matrix two dimensional array containing
047     * @param swapped tells whether the value for left character sequence and right
048     *            character sequence were swapped to save memory
049     * @return result object containing the count of insert, delete and substitute and total count needed
050     */
051    private static LevenshteinResults findDetailedResults(final CharSequence left,
052                                                          final CharSequence right,
053                                                          final int[][] matrix,
054                                                          final boolean swapped) {
055
056        int delCount = 0;
057        int addCount = 0;
058        int subCount = 0;
059
060        int rowIndex = right.length();
061        int columnIndex = left.length();
062
063        int dataAtLeft = 0;
064        int dataAtTop = 0;
065        int dataAtDiagonal = 0;
066        int data = 0;
067        boolean deleted = false;
068        boolean added = false;
069
070        while (rowIndex >= 0 && columnIndex >= 0) {
071
072            if (columnIndex == 0) {
073                dataAtLeft = -1;
074            } else {
075                dataAtLeft = matrix[rowIndex][columnIndex - 1];
076            }
077            if (rowIndex == 0) {
078                dataAtTop = -1;
079            } else {
080                dataAtTop = matrix[rowIndex - 1][columnIndex];
081            }
082            if (rowIndex > 0 && columnIndex > 0) {
083                dataAtDiagonal = matrix[rowIndex - 1][columnIndex - 1];
084            } else {
085                dataAtDiagonal = -1;
086            }
087            if (dataAtLeft == -1 && dataAtTop == -1 && dataAtDiagonal == -1) {
088                break;
089            }
090            data = matrix[rowIndex][columnIndex];
091
092            // case in which the character at left and right are the same,
093            // in this case none of the counters will be incremented.
094            if (columnIndex > 0 && rowIndex > 0 && left.charAt(columnIndex - 1) == right.charAt(rowIndex - 1)) {
095                columnIndex--;
096                rowIndex--;
097                continue;
098            }
099
100            // handling insert and delete cases.
101            deleted = false;
102            added = false;
103            if (data - 1 == dataAtLeft && data <= dataAtDiagonal && data <= dataAtTop
104                    || dataAtDiagonal == -1 && dataAtTop == -1) { // NOPMD
105                columnIndex--;
106                if (swapped) {
107                    addCount++;
108                    added = true;
109                } else {
110                    delCount++;
111                    deleted = true;
112                }
113            } else if (data - 1 == dataAtTop && data <= dataAtDiagonal && data <= dataAtLeft
114                    || dataAtDiagonal == -1 && dataAtLeft == -1) { // NOPMD
115                rowIndex--;
116                if (swapped) {
117                    delCount++;
118                    deleted = true;
119                } else {
120                    addCount++;
121                    added = true;
122                }
123            }
124
125            // substituted case
126            if (!added && !deleted) {
127                subCount++;
128                columnIndex--;
129                rowIndex--;
130            }
131        }
132        return new LevenshteinResults(addCount + delCount + subCount, addCount, delCount, subCount);
133    }
134
135    /**
136     * Gets the default instance.
137     *
138     * @return The default instace
139     */
140    public static LevenshteinDetailedDistance getDefaultInstance() {
141        return DEFAULT_INSTANCE;
142    }
143
144    /**
145     * Finds the Levenshtein distance between two CharSequences if it's less than or
146     * equal to a given threshold.
147     *
148     * <p>
149     * This implementation follows from Algorithms on Strings, Trees and
150     * Sequences by Dan Gusfield and Chas Emerick's implementation of the
151     * Levenshtein distance algorithm from <a
152     * href="http://www.merriampark.com/ld.htm"
153     * >http://www.merriampark.com/ld.htm</a>
154     * </p>
155     *
156     * <pre>
157     * limitedCompare(null, *, *)             = IllegalArgumentException
158     * limitedCompare(*, null, *)             = IllegalArgumentException
159     * limitedCompare(*, *, -1)               = IllegalArgumentException
160     * limitedCompare("","", 0)               = 0
161     * limitedCompare("aaapppp", "", 8)       = 7
162     * limitedCompare("aaapppp", "", 7)       = 7
163     * limitedCompare("aaapppp", "", 6))      = -1
164     * limitedCompare("elephant", "hippo", 7) = 7
165     * limitedCompare("elephant", "hippo", 6) = -1
166     * limitedCompare("hippo", "elephant", 7) = 7
167     * limitedCompare("hippo", "elephant", 6) = -1
168     * </pre>
169     *
170     * @param left the first CharSequence, must not be null
171     * @param right the second CharSequence, must not be null
172     * @param threshold the target threshold, must not be negative
173     * @return result distance, or -1
174     */
175    private static LevenshteinResults limitedCompare(CharSequence left,
176                                                     CharSequence right,
177                                                     final int threshold) { //NOPMD
178        if (left == null || right == null) {
179            throw new IllegalArgumentException("CharSequences must not be null");
180        }
181        if (threshold < 0) {
182            throw new IllegalArgumentException("Threshold must not be negative");
183        }
184
185        /*
186         * This implementation only computes the distance if it's less than or
187         * equal to the threshold value, returning -1 if it's greater. The
188         * advantage is performance: unbounded distance is O(nm), but a bound of
189         * k allows us to reduce it to O(km) time by only computing a diagonal
190         * stripe of width 2k + 1 of the cost table. It is also possible to use
191         * this to compute the unbounded Levenshtein distance by starting the
192         * threshold at 1 and doubling each time until the distance is found;
193         * this is O(dm), where d is the distance.
194         *
195         * One subtlety comes from needing to ignore entries on the border of
196         * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry
197         * to the left of the leftmost member We must ignore the entry above the
198         * rightmost member
199         *
200         * Another subtlety comes from our stripe running off the matrix if the
201         * strings aren't of the same size. Since string s is always swapped to
202         * be the shorter of the two, the stripe will always run off to the
203         * upper right instead of the lower left of the matrix.
204         *
205         * As a concrete example, suppose s is of length 5, t is of length 7,
206         * and our threshold is 1. In this case we're going to walk a stripe of
207         * length 3. The matrix would look like so:
208         *
209         * <pre>
210         *    1 2 3 4 5
211         * 1 |#|#| | | |
212         * 2 |#|#|#| | |
213         * 3 | |#|#|#| |
214         * 4 | | |#|#|#|
215         * 5 | | | |#|#|
216         * 6 | | | | |#|
217         * 7 | | | | | |
218         * </pre>
219         *
220         * Note how the stripe leads off the table as there is no possible way
221         * to turn a string of length 5 into one of length 7 in edit distance of
222         * 1.
223         *
224         * Additionally, this implementation decreases memory usage by using two
225         * single-dimensional arrays and swapping them back and forth instead of
226         * allocating an entire n by m matrix. This requires a few minor
227         * changes, such as immediately returning when it's detected that the
228         * stripe has run off the matrix and initially filling the arrays with
229         * large values so that entries we don't compute are ignored.
230         *
231         * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for
232         * some discussion.
233         */
234
235        int n = left.length(); // length of left
236        int m = right.length(); // length of right
237
238        // if one string is empty, the edit distance is necessarily the length of the other
239        if (n == 0) {
240            return m <= threshold ? new LevenshteinResults(m, m, 0, 0) : new LevenshteinResults(-1, 0, 0, 0);
241        }
242        if (m == 0) {
243            return n <= threshold ? new LevenshteinResults(n, 0, n, 0) : new LevenshteinResults(-1, 0, 0, 0);
244        }
245
246        boolean swapped = false;
247        if (n > m) {
248            // swap the two strings to consume less memory
249            final CharSequence tmp = left;
250            left = right;
251            right = tmp;
252            n = m;
253            m = right.length();
254            swapped = true;
255        }
256
257        int[] p = new int[n + 1]; // 'previous' cost array, horizontally
258        int[] d = new int[n + 1]; // cost array, horizontally
259        int[] tempD; // placeholder to assist in swapping p and d
260        final int[][] matrix = new int[m + 1][n + 1];
261
262        //filling the first row and first column values in the matrix
263        for (int index = 0; index <= n; index++) {
264            matrix[0][index] = index;
265        }
266        for (int index = 0; index <= m; index++) {
267            matrix[index][0] = index;
268        }
269
270        // fill in starting table values
271        final int boundary = Math.min(n, threshold) + 1;
272        for (int i = 0; i < boundary; i++) {
273            p[i] = i;
274        }
275        // these fills ensure that the value above the rightmost entry of our
276        // stripe will be ignored in following loop iterations
277        Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE);
278        Arrays.fill(d, Integer.MAX_VALUE);
279
280        // iterates through t
281        for (int j = 1; j <= m; j++) {
282            final char rightJ = right.charAt(j - 1); // jth character of right
283            d[0] = j;
284
285            // compute stripe indices, constrain to array size
286            final int min = Math.max(1, j - threshold);
287            final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min(
288                    n, j + threshold);
289
290            // the stripe may lead off of the table if s and t are of different sizes
291            if (min > max) {
292                return new LevenshteinResults(-1, 0, 0, 0);
293            }
294
295            // ignore entry left of leftmost
296            if (min > 1) {
297                d[min - 1] = Integer.MAX_VALUE;
298            }
299
300            // iterates through [min, max] in s
301            for (int i = min; i <= max; i++) {
302                if (left.charAt(i - 1) == rightJ) {
303                    // diagonally left and up
304                    d[i] = p[i - 1];
305                } else {
306                    // 1 + minimum of cell to the left, to the top, diagonally left and up
307                    d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]);
308                }
309                matrix[j][i] = d[i];
310            }
311
312            // copy current distance counts to 'previous row' distance counts
313            tempD = p;
314            p = d;
315            d = tempD;
316        }
317
318        // if p[n] is greater than the threshold, there's no guarantee on it being the correct distance
319        if (p[n] <= threshold) {
320            return findDetailedResults(left, right, matrix, swapped);
321        }
322        return new LevenshteinResults(-1, 0, 0, 0);
323    }
324
325    /**
326     * Finds the Levenshtein distance between two Strings.
327     *
328     * <p>A higher score indicates a greater distance.</p>
329     *
330     * <p>The previous implementation of the Levenshtein distance algorithm
331     * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
332     *
333     * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
334     * which can occur when my Java implementation is used with very large strings.<br>
335     * This implementation of the Levenshtein distance algorithm
336     * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
337     *
338     * <pre>
339     * unlimitedCompare(null, *)             = IllegalArgumentException
340     * unlimitedCompare(*, null)             = IllegalArgumentException
341     * unlimitedCompare("","")               = 0
342     * unlimitedCompare("","a")              = 1
343     * unlimitedCompare("aaapppp", "")       = 7
344     * unlimitedCompare("frog", "fog")       = 1
345     * unlimitedCompare("fly", "ant")        = 3
346     * unlimitedCompare("elephant", "hippo") = 7
347     * unlimitedCompare("hippo", "elephant") = 7
348     * unlimitedCompare("hippo", "zzzzzzzz") = 8
349     * unlimitedCompare("hello", "hallo")    = 1
350     * </pre>
351     *
352     * @param left the first CharSequence, must not be null
353     * @param right the second CharSequence, must not be null
354     * @return result distance, or -1
355     * @throws IllegalArgumentException if either CharSequence input is {@code null}
356     */
357    private static LevenshteinResults unlimitedCompare(CharSequence left, CharSequence right) {
358        if (left == null || right == null) {
359            throw new IllegalArgumentException("CharSequences must not be null");
360        }
361
362        /*
363           The difference between this impl. and the previous is that, rather
364           than creating and retaining a matrix of size s.length() + 1 by t.length() + 1,
365           we maintain two single-dimensional arrays of length s.length() + 1.  The first, d,
366           is the 'current working' distance array that maintains the newest distance cost
367           counts as we iterate through the characters of String s.  Each time we increment
368           the index of String t we are comparing, d is copied to p, the second int[].  Doing so
369           allows us to retain the previous cost counts as required by the algorithm (taking
370           the minimum of the cost count to the left, up one, and diagonally up and to the left
371           of the current cost count being calculated).  (Note that the arrays aren't really
372           copied anymore, just switched...this is clearly much better than cloning an array
373           or doing a System.arraycopy() each time  through the outer loop.)
374
375           Effectively, the difference between the two implementations is this one does not
376           cause an out of memory condition when calculating the LD over two very large strings.
377         */
378
379        int n = left.length(); // length of left
380        int m = right.length(); // length of right
381
382        if (n == 0) {
383            return new LevenshteinResults(m, m, 0, 0);
384        }
385        if (m == 0) {
386            return new LevenshteinResults(n, 0, n, 0);
387        }
388        boolean swapped = false;
389        if (n > m) {
390            // swap the input strings to consume less memory
391            final CharSequence tmp = left;
392            left = right;
393            right = tmp;
394            n = m;
395            m = right.length();
396            swapped = true;
397        }
398
399        int[] p = new int[n + 1]; // 'previous' cost array, horizontally
400        int[] d = new int[n + 1]; // cost array, horizontally
401        int[] tempD; //placeholder to assist in swapping p and d
402        final int[][] matrix = new int[m + 1][n + 1];
403
404        // filling the first row and first column values in the matrix
405        for (int index = 0; index <= n; index++) {
406            matrix[0][index] = index;
407        }
408        for (int index = 0; index <= m; index++) {
409            matrix[index][0] = index;
410        }
411
412        // indexes into strings left and right
413        int i; // iterates through left
414        int j; // iterates through right
415
416        char rightJ; // jth character of right
417
418        int cost; // cost
419        for (i = 0; i <= n; i++) {
420            p[i] = i;
421        }
422
423        for (j = 1; j <= m; j++) {
424            rightJ = right.charAt(j - 1);
425            d[0] = j;
426
427            for (i = 1; i <= n; i++) {
428                cost = left.charAt(i - 1) == rightJ ? 0 : 1;
429                // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
430                d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
431                //filling the matrix
432                matrix[j][i] = d[i];
433            }
434
435            // copy current distance counts to 'previous row' distance counts
436            tempD = p;
437            p = d;
438            d = tempD;
439        }
440        return findDetailedResults(left, right, matrix, swapped);
441    }
442
443    /**
444     * Threshold.
445     */
446    private final Integer threshold;
447
448    /**
449     * <p>
450     * This returns the default instance that uses a version
451     * of the algorithm that does not use a threshold parameter.
452     * </p>
453     *
454     * @see LevenshteinDetailedDistance#getDefaultInstance()
455     */
456    public LevenshteinDetailedDistance() {
457        this(null);
458    }
459
460    /**
461     * If the threshold is not null, distance calculations will be limited to a maximum length.
462     *
463     * <p>If the threshold is null, the unlimited version of the algorithm will be used.</p>
464     *
465     * @param threshold If this is null then distances calculations will not be limited. This may not be negative.
466     */
467    public LevenshteinDetailedDistance(final Integer threshold) {
468        if (threshold != null && threshold < 0) {
469            throw new IllegalArgumentException("Threshold must not be negative");
470        }
471        this.threshold = threshold;
472    }
473
474    /**
475     * Finds the Levenshtein distance between two Strings.
476     *
477     * <p>A higher score indicates a greater distance.</p>
478     *
479     * <p>The previous implementation of the Levenshtein distance algorithm
480     * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
481     *
482     * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
483     * which can occur when my Java implementation is used with very large strings.<br>
484     * This implementation of the Levenshtein distance algorithm
485     * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
486     *
487     * <pre>
488     * distance.apply(null, *)             = IllegalArgumentException
489     * distance.apply(*, null)             = IllegalArgumentException
490     * distance.apply("","")               = 0
491     * distance.apply("","a")              = 1
492     * distance.apply("aaapppp", "")       = 7
493     * distance.apply("frog", "fog")       = 1
494     * distance.apply("fly", "ant")        = 3
495     * distance.apply("elephant", "hippo") = 7
496     * distance.apply("hippo", "elephant") = 7
497     * distance.apply("hippo", "zzzzzzzz") = 8
498     * distance.apply("hello", "hallo")    = 1
499     * </pre>
500     *
501     * @param left the first string, must not be null
502     * @param right the second string, must not be null
503     * @return result distance, or -1
504     * @throws IllegalArgumentException if either String input {@code null}
505     */
506    @Override
507    public LevenshteinResults apply(final CharSequence left, final CharSequence right) {
508        if (threshold != null) {
509            return limitedCompare(left, right, threshold);
510        }
511        return unlimitedCompare(left, right);
512    }
513
514    /**
515     * Gets the distance threshold.
516     *
517     * @return The distance threshold
518     */
519    public Integer getThreshold() {
520        return threshold;
521    }
522}