001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text.diff;
018
019/**
020 * <p>
021 * It is guaranteed that the comparisons will always be done as
022 * {@code o1.equals(o2)} where {@code o1} belongs to the first
023 * sequence and {@code o2} belongs to the second sequence. This can
024 * be important if subclassing is used for some elements in the first
025 * sequence and the {@code equals} method is specialized.
026 * </p>
027 * <p>
028 * Comparison can be seen from two points of view: either as giving the smallest
029 * modification allowing to transform the first sequence into the second one, or
030 * as giving the longest sequence which is a subsequence of both initial
031 * sequences. The {@code equals} method is used to compare objects, so any
032 * object can be put into sequences. Modifications include deleting, inserting
033 * or keeping one object, starting from the beginning of the first sequence.
034 * </p>
035 * <p>
036 * This class implements the comparison algorithm, which is the very efficient
037 * algorithm from Eugene W. Myers
038 * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
039 * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
040 * the shortest possible {@link EditScript edit script} containing all the
041 * {@link EditCommand commands} needed to transform the first sequence into
042 * the second one.
043 *
044 * <p>
045 * This code has been adapted from Apache Commons Collections 4.0.
046 * </p>
047 *
048 * @see EditScript
049 * @see EditCommand
050 * @see CommandVisitor
051 * @since 1.0
052 */
053public class StringsComparator {
054
055    /**
056     * This class is a simple placeholder to hold the end part of a path
057     * under construction in a {@link StringsComparator StringsComparator}.
058     */
059    private static class Snake {
060
061        /** Start index. */
062        private final int start;
063
064        /** End index. */
065        private final int end;
066
067        /** Diagonal number. */
068        private final int diag;
069
070        /**
071         * Constructs a new instance of Snake with specified indices.
072         *
073         * @param start  start index of the snake
074         * @param end  end index of the snake
075         * @param diag  diagonal number
076         */
077        Snake(final int start, final int end, final int diag) {
078            this.start = start;
079            this.end   = end;
080            this.diag  = diag;
081        }
082
083        /**
084         * Gets the diagonal number of the snake.
085         *
086         * @return diagonal number of the snake
087         */
088        public int getDiag() {
089            return diag;
090        }
091
092        /**
093         * Gets the end index of the snake.
094         *
095         * @return end index of the snake
096         */
097        public int getEnd() {
098            return end;
099        }
100
101        /**
102         * Gets the start index of the snake.
103         *
104         * @return start index of the snake
105         */
106        public int getStart() {
107            return start;
108        }
109    }
110    /**
111     * First character sequence.
112     */
113    private final String left;
114    /**
115     * Second character sequence.
116     */
117    private final String right;
118    /**
119     * Temporary array.
120     */
121    private final int[] vDown;
122
123    /**
124     * Temporary array.
125     */
126    private final int[] vUp;
127
128    /**
129     * Constructs a new instance of StringsComparator.
130     * <p>
131     * It is <em>guaranteed</em> that the comparisons will always be done as
132     * {@code o1.equals(o2)} where {@code o1} belongs to the first
133     * sequence and {@code o2} belongs to the second sequence. This can be
134     * important if subclassing is used for some elements in the first sequence
135     * and the {@code equals} method is specialized.
136     * </p>
137     *
138     * @param left first character sequence to be compared
139     * @param right second character sequence to be compared
140     */
141    public StringsComparator(final String left, final String right) {
142        this.left = left;
143        this.right = right;
144
145        final int size = left.length() + right.length() + 2;
146        vDown = new int[size];
147        vUp   = new int[size];
148    }
149
150    /**
151     * Builds an edit script.
152     *
153     * @param start1  the begin of the first sequence to be compared
154     * @param end1  the end of the first sequence to be compared
155     * @param start2  the begin of the second sequence to be compared
156     * @param end2  the end of the second sequence to be compared
157     * @param script the edited script
158     */
159    private void buildScript(final int start1, final int end1, final int start2, final int end2,
160            final EditScript<Character> script) {
161        final Snake middle = getMiddleSnake(start1, end1, start2, end2);
162
163        if (middle == null
164                || middle.getStart() == end1 && middle.getDiag() == end1 - end2
165                || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
166
167            int i = start1;
168            int j = start2;
169            while (i < end1 || j < end2) {
170                if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) {
171                    script.append(new KeepCommand<>(left.charAt(i)));
172                    ++i;
173                    ++j;
174                } else {
175                    if (end1 - start1 > end2 - start2) {
176                        script.append(new DeleteCommand<>(left.charAt(i)));
177                        ++i;
178                    } else {
179                        script.append(new InsertCommand<>(right.charAt(j)));
180                        ++j;
181                    }
182                }
183            }
184
185        } else {
186
187            buildScript(start1, middle.getStart(),
188                        start2, middle.getStart() - middle.getDiag(),
189                        script);
190            for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
191                script.append(new KeepCommand<>(left.charAt(i)));
192            }
193            buildScript(middle.getEnd(), end1,
194                        middle.getEnd() - middle.getDiag(), end2,
195                        script);
196        }
197    }
198
199    /**
200     * Builds a snake.
201     *
202     * @param start  the value of the start of the snake
203     * @param diag  the value of the diagonal of the snake
204     * @param end1  the value of the end of the first sequence to be compared
205     * @param end2  the value of the end of the second sequence to be compared
206     * @return The snake built
207     */
208    private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
209        int end = start;
210        while (end - diag < end2
211                && end < end1
212                && left.charAt(end) == right.charAt(end - diag)) {
213            ++end;
214        }
215        return new Snake(start, end, diag);
216    }
217
218    /**
219     * Gets the middle snake corresponding to two subsequences of the
220     * main sequences.
221     * <p>
222     * The snake is found using the MYERS Algorithm (this algorithms has
223     * also been implemented in the GNU diff program). This algorithm is
224     * explained in Eugene Myers article:
225     * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
226     * An O(ND) Difference Algorithm and Its Variations</a>.
227     * </p>
228     *
229     * @param start1  the begin of the first sequence to be compared
230     * @param end1  the end of the first sequence to be compared
231     * @param start2  the begin of the second sequence to be compared
232     * @param end2  the end of the second sequence to be compared
233     * @return The middle snake
234     */
235    private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
236        // Myers Algorithm
237        // Initialisations
238        final int m = end1 - start1;
239        final int n = end2 - start2;
240        if (m == 0 || n == 0) {
241            return null;
242        }
243
244        final int delta  = m - n;
245        final int sum    = n + m;
246        final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
247        vDown[1 + offset] = start1;
248        vUp[1 + offset]   = end1 + 1;
249
250        for (int d = 0; d <= offset; ++d) {
251            // Down
252            for (int k = -d; k <= d; k += 2) {
253                // First step
254
255                final int i = k + offset;
256                if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
257                    vDown[i] = vDown[i + 1];
258                } else {
259                    vDown[i] = vDown[i - 1] + 1;
260                }
261
262                int x = vDown[i];
263                int y = x - start1 + start2 - k;
264
265                while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) {
266                    vDown[i] = ++x;
267                    ++y;
268                }
269                // Second step
270                if (delta % 2 != 0 && delta - d <= k && k <= delta + d) {
271                    if (vUp[i - delta] <= vDown[i]) { // NOPMD
272                        return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
273                    }
274                }
275            }
276
277            // Up
278            for (int k = delta - d; k <= delta + d; k += 2) {
279                // First step
280                final int i = k + offset - delta;
281                if (k == delta - d
282                        || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
283                    vUp[i] = vUp[i + 1] - 1;
284                } else {
285                    vUp[i] = vUp[i - 1];
286                }
287
288                int x = vUp[i] - 1;
289                int y = x - start1 + start2 - k;
290                while (x >= start1 && y >= start2
291                        && left.charAt(x) == right.charAt(y)) {
292                    vUp[i] = x--;
293                    y--;
294                }
295                // Second step
296                if (delta % 2 == 0 && -d <= k && k <= d) {
297                    if (vUp[i] <= vDown[i + delta]) { // NOPMD
298                        return buildSnake(vUp[i], k + start1 - start2, end1, end2);
299                    }
300                }
301            }
302        }
303
304        // this should not happen
305        throw new IllegalStateException("Internal Error");
306    }
307
308    /**
309     * Gets the {@link EditScript} object.
310     * <p>
311     * It is guaranteed that the objects embedded in the {@link InsertCommand
312     * insert commands} come from the second sequence and that the objects
313     * embedded in either the {@link DeleteCommand delete commands} or
314     * {@link KeepCommand keep commands} come from the first sequence. This can
315     * be important if subclassing is used for some elements in the first
316     * sequence and the {@code equals} method is specialized.
317     * </p>
318     *
319     * @return The edit script resulting from the comparison of the two
320     *         sequences
321     */
322    public EditScript<Character> getScript() {
323        final EditScript<Character> script = new EditScript<>();
324        buildScript(0, left.length(), 0, right.length(), script);
325        return script;
326    }
327
328}