Source code

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
017package org.apache.commons.text;
018
019import java.io.UnsupportedEncodingException;
020import java.util.Arrays;
021import java.util.Collection;
022import java.util.Collections;
023import java.util.HashMap;
024import java.util.Iterator;
025import java.util.LinkedHashMap;
026import java.util.LinkedHashSet;
027import java.util.Map;
028import java.util.Map.Entry;
029import java.util.Objects;
030import java.util.Set;
031
032import org.apache.commons.lang3.ArrayUtils;
033import org.apache.commons.lang3.StringUtils;
034
/**
 * <p>
 * Converts text from one alphabet to another, with the possibility of leaving
 * certain characters unencoded.
 * </p>
 *
 * <p>
 * The target (encoding) alphabet and the 'do not encode' characters must be in
 * the Unicode BMP, but the source alphabet does not have to be.
 * </p>
 *
 * <p>
 * Every encoded letter has the same fixed length, except for the 'do not
 * encode' characters, which are copied through unchanged.
 * </p>
 *
 * <h2>Sample usage</h2>
 *
 * <pre>
 * Character[] originals;   // a, b, c, d
 * Character[] encoding;    // 0, 1, d
 * Character[] doNotEncode; // d
 *
 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals,
 * encoding, doNotEncode);
 *
 * ac.encode("a");    // 00
 * ac.encode("b");    // 01
 * ac.encode("c");    // 0d
 * ac.encode("d");    // d
 * ac.encode("abcd"); // 00010dd
 * </pre>
 *
 * <p>
 * #ThreadSafe# AlphabetConverter class methods are thread-safe as they do not
 * change internal state: the lookup tables are only written while a factory
 * method is building the instance.
 * </p>
 *
 * @since 1.0
 *
 */
public final class AlphabetConverter {

    /**
     * Arrow constant, used for converting the object into a string.
     */
    private static final String ARROW = " -> ";

    /**
     * Creates new String that contains just the given code point.
     *
     * @param i code point
     * @return a new string with the new code point
     * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
     */
    private static String codePointToString(final int i) {
        if (Character.charCount(i) == 1) {
            return String.valueOf((char) i);
        }
        // supplementary code point: needs a surrogate pair
        return new String(Character.toChars(i));
    }

    /**
     * Converts characters to integers.
     *
     * @param chars array of characters, may be {@code null} or empty
     * @return an equivalent array of integers; an empty array when
     *         {@code chars} is {@code null} or empty
     */
    private static Integer[] convertCharsToIntegers(final Character[] chars) {
        if (chars == null || chars.length == 0) {
            return new Integer[0];
        }
        final Integer[] integers = new Integer[chars.length];
        for (int i = 0; i < chars.length; i++) {
            integers[i] = (int) chars[i];
        }
        return integers;
    }

    /**
     * Creates an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving
     * the characters in <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * <p>When the encoding alphabet is at least as large as the original one,
     * every letter is encoded with a single letter. Otherwise a fixed length
     * greater than one is computed and encodings are assigned in the iteration
     * order of the given arrays.</p>
     *
     * @param original an array of ints representing the original alphabet in
     *                 code points
     * @param encoding an array of ints representing the alphabet to be used for
     *                 encoding, in code points
     * @param doNotEncode an array of ints representing the chars that are to be
     *                    left as they are - every char here must appear in
     *                    both the previous params
     * @return The AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                   constructed
     */
    public static AlphabetConverter createConverter(
            final Integer[] original,
            final Integer[] encoding,
            final Integer[] doNotEncode) {
        // LinkedHashSet drops duplicates while keeping the caller's ordering
        final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.asList(original));
        final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.asList(encoding));
        final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.asList(doNotEncode));

        final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
        final Map<Integer, String> doNotEncodeMap = new HashMap<>();

        for (final int i : doNotEncodeCopy) {
            if (!originalCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because original "
                                + "alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            if (!encodingCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because encoding alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            doNotEncodeMap.put(i, codePointToString(i));
        }

        if (encodingCopy.size() >= originalCopy.size()) {
            // enough encoding letters for a one-to-one mapping
            final Iterator<Integer> it = encodingCopy.iterator();

            for (final int originalLetter : originalCopy) {
                final String originalLetterAsString =
                        codePointToString(originalLetter);

                if (doNotEncodeMap.containsKey(originalLetter)) {
                    originalToEncoded.put(originalLetter,
                            originalLetterAsString);
                    encodedToOriginal.put(originalLetterAsString,
                            originalLetterAsString);
                } else {
                    // 'do not encode' letters are reserved for themselves,
                    // so skip them when picking the next encoding letter
                    Integer next = it.next();

                    while (doNotEncodeCopy.contains(next)) {
                        next = it.next();
                    }

                    final String encodedLetter = codePointToString(next);

                    originalToEncoded.put(originalLetter, encodedLetter);
                    encodedToOriginal.put(encodedLetter,
                            originalLetterAsString);
                }
            }

            return new AlphabetConverter(originalToEncoded,
                    encodedToOriginal,
                    1);
        }

        // letters that may appear in the leftmost place of an encoded group
        final int leadingLetters = encodingCopy.size() - doNotEncodeCopy.size();
        if (leadingLetters < 2) {
            throw new IllegalArgumentException(
                    "Must have at least two encoding characters (excluding "
                            + "those in the 'do not encode' list), but has "
                            + leadingLetters);
        }
        // we start with one which is our minimum, and because we do the
        // first division outside the loop
        int lettersSoFar = 1;

        // the first division takes into account that the doNotEncode
        // letters can't be in the leftmost place
        int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size())
                / leadingLetters;

        while (lettersLeft / encodingCopy.size() >= 1) {
            lettersLeft = lettersLeft / encodingCopy.size();
            lettersSoFar++;
        }

        final int encodedLetterLength = lettersSoFar + 1;

        final AlphabetConverter ac =
                new AlphabetConverter(originalToEncoded,
                        encodedToOriginal,
                        encodedLetterLength);

        // fills both (still empty) maps held by the new instance
        ac.addSingleEncoding(encodedLetterLength,
                "",
                encodingCopy,
                originalCopy.iterator(),
                doNotEncodeMap);

        return ac;
    }

    /**
     * Creates an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving the characters in
     * <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.
     * A {@code null} array is treated like an empty one.</p>
     *
     * @param original an array of chars representing the original alphabet
     * @param encoding an array of chars representing the alphabet to be used
     *                 for encoding
     * @param doNotEncode an array of chars that are to be left as they are -
     *                    every char here must appear in
     *                    both the previous params
     * @return The AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                  constructed
     */
    public static AlphabetConverter createConverterFromChars(
            final Character[] original,
            final Character[] encoding,
            final Character[] doNotEncode) {
        return AlphabetConverter.createConverter(
                convertCharsToIntegers(original),
                convertCharsToIntegers(encoding),
                convertCharsToIntegers(doNotEncode));
    }

    /**
     * Creates a new converter from a map.
     *
     * <p>The map is copied, so changing it afterwards does not affect the
     * returned converter. The encoded letter length is the length of the
     * longest value in the map (at least 1).</p>
     *
     * @param originalToEncoded a map returned from getOriginalToEncoded()
     * @return The reconstructed AlphabetConverter
     * @throws NullPointerException if {@code originalToEncoded} is {@code null}
     * @see AlphabetConverter#getOriginalToEncoded()
     */
    public static AlphabetConverter createConverterFromMap(final Map<Integer, String> originalToEncoded) {
        Objects.requireNonNull(originalToEncoded, "originalToEncoded");
        // defensive copy: an unmodifiable *view* of the caller's map would
        // still change under us whenever the caller mutates the map
        final Map<Integer, String> originalToEncodedCopy =
                Collections.unmodifiableMap(new LinkedHashMap<>(originalToEncoded));
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();

        int encodedLetterLength = 1;

        for (final Entry<Integer, String> e : originalToEncodedCopy.entrySet()) {
            final String encodedLetter = e.getValue();
            encodedToOriginal.put(encodedLetter, codePointToString(e.getKey()));

            if (encodedLetter.length() > encodedLetterLength) {
                encodedLetterLength = encodedLetter.length();
            }
        }

        return new AlphabetConverter(originalToEncodedCopy, encodedToOriginal, encodedLetterLength);
    }

    /**
     * Maps each code point of the original alphabet to its encoded string.
     */
    private final Map<Integer, String> originalToEncoded;

    /**
     * Maps each encoded string back to the original letter (as a string).
     */
    private final Map<String, String> encodedToOriginal;

    /**
     * Length of the encoded letter.
     */
    private final int encodedLetterLength;

    /**
     * Hidden constructor for alphabet converter. Used by static helper methods.
     * The maps are stored as given, without copying.
     *
     * @param originalToEncoded map from original code point to encoded string
     * @param encodedToOriginal map from encoded string to original letter
     * @param encodedLetterLength length of the encoded letter
     */
    private AlphabetConverter(final Map<Integer, String> originalToEncoded,
                              final Map<String, String> encodedToOriginal,
                              final int encodedLetterLength) {

        this.originalToEncoded = originalToEncoded;
        this.encodedToOriginal = encodedToOriginal;
        this.encodedLetterLength = encodedLetterLength;
    }

    /**
     * Recursive method used when creating encoder/decoder.
     *
     * <p>While {@code level} is positive, one more encoding letter is appended
     * to {@code currentEncoding}; at level 0 the finished encoding is assigned
     * to the next original letter that is not in {@code doNotEncodeMap}.</p>
     *
     * @param level number of letters still to be appended to the current
     *              encoding
     * @param currentEncoding the encoding built so far
     * @param encoding letters of the encoding alphabet
     * @param originals iterator over the original letters still without an
     *                  encoding
     * @param doNotEncodeMap map of values that should not be encoded
     */
    private void addSingleEncoding(final int level,
                                   final String currentEncoding,
                                   final Collection<Integer> encoding,
                                   final Iterator<Integer> originals,
                                   final Map<Integer, String> doNotEncodeMap) {

        if (level > 0) {
            for (final int encodingLetter : encoding) {
                if (!originals.hasNext()) {
                    return; // done encoding all the original alphabet
                }
                // this skips the doNotEncode chars if they are in the
                // leftmost place
                if (level != encodedLetterLength
                        || !doNotEncodeMap.containsKey(encodingLetter)) {
                    addSingleEncoding(level - 1,
                            currentEncoding
                                    + codePointToString(encodingLetter),
                            encoding,
                            originals,
                            doNotEncodeMap
                    );
                }
            }
        } else {
            Integer next = originals.next();

            // 'do not encode' letters map to themselves and do not consume
            // the current encoding
            while (doNotEncodeMap.containsKey(next)) {
                final String originalLetterAsString = codePointToString(next);

                originalToEncoded.put(next, originalLetterAsString);
                encodedToOriginal.put(originalLetterAsString,
                        originalLetterAsString);

                if (!originals.hasNext()) {
                    return;
                }

                next = originals.next();
            }

            final String originalLetterAsString = codePointToString(next);

            originalToEncoded.put(next, currentEncoding);
            encodedToOriginal.put(currentEncoding, originalLetterAsString);
        }
    }

    /**
     * Decodes a given string.
     *
     * @param encoded a string that has been encoded using this
     *                AlphabetConverter
     * @return The decoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if unexpected characters that
     *                                      cannot be handled are encountered
     */
    public String decode(final String encoded)
            throws UnsupportedEncodingException {
        if (encoded == null) {
            return null;
        }

        final StringBuilder result = new StringBuilder();

        for (int j = 0; j < encoded.length();) {
            final int i = encoded.codePointAt(j);
            final String s = codePointToString(i);

            if (s.equals(originalToEncoded.get(i))) {
                // letter that is mapped to itself
                result.append(s);
                // one char for BMP letters; two when the letter is a
                // supplementary code point, so the low surrogate is not
                // re-read as a letter of its own
                j += Character.charCount(i);
            } else {
                if (j + encodedLetterLength > encoded.length()) {
                    throw new UnsupportedEncodingException("Unexpected end "
                            + "of string while decoding " + encoded);
                }
                final String nextGroup = encoded.substring(j,
                        j + encodedLetterLength);
                final String next = encodedToOriginal.get(nextGroup);
                if (next == null) {
                    throw new UnsupportedEncodingException(
                            "Unexpected string without decoding ("
                                    + nextGroup + ") in " + encoded);
                }
                result.append(next);
                j += encodedLetterLength;
            }
        }

        return result.toString();
    }

    /**
     * Encodes a given string.
     *
     * @param original the string to be encoded
     * @return The encoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if chars that are not supported are
     *                                      encountered
     */
    public String encode(final String original)
            throws UnsupportedEncodingException {
        if (original == null) {
            return null;
        }

        final StringBuilder sb = new StringBuilder();

        // walk by code point so supplementary letters are looked up whole
        for (int i = 0; i < original.length();) {
            final int codePoint = original.codePointAt(i);

            final String nextLetter = originalToEncoded.get(codePoint);

            if (nextLetter == null) {
                throw new UnsupportedEncodingException(
                        "Couldn't find encoding for '"
                                + codePointToString(codePoint)
                                + "' in "
                                + original
                );
            }

            sb.append(nextLetter);

            i += Character.charCount(codePoint);
        }

        return sb.toString();
    }

    /**
     * Compares both lookup tables and the encoded letter length.
     *
     * @param obj the object to compare with
     * @return {@code true} if {@code obj} is an AlphabetConverter with equal
     *         mappings and encoded letter length
     */
    @Override
    public boolean equals(final Object obj) {
        if (obj == null) {
            return false;
        }
        if (obj == this) {
            return true;
        }
        if (!(obj instanceof AlphabetConverter)) {
            return false;
        }
        final AlphabetConverter other = (AlphabetConverter) obj;
        return originalToEncoded.equals(other.originalToEncoded)
                && encodedToOriginal.equals(other.encodedToOriginal)
                && encodedLetterLength == other.encodedLetterLength;
    }

    /**
     * Gets the length of characters in the encoded alphabet that are necessary
     * for each character in the original
     * alphabet.
     *
     * @return The length of the encoded char
     */
    public int getEncodedCharLength() {
        return encodedLetterLength;
    }

    /**
     * Gets the mapping from integer code point of source language to encoded
     * string. Use to reconstruct converter from
     * serialized map.
     *
     * @return The original map, as an unmodifiable view
     */
    public Map<Integer, String> getOriginalToEncoded() {
        return Collections.unmodifiableMap(originalToEncoded);
    }

    @Override
    public int hashCode() {
        return Objects.hash(originalToEncoded,
                encodedToOriginal,
                encodedLetterLength);
    }

    /**
     * Lists every mapping on its own line as
     * {@code <original letter> -> <encoded string>}.
     *
     * @return the textual form of the original-to-encoded table
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();
        // @formatter:off
        originalToEncoded.forEach((k, v) ->
            sb.append(codePointToString(k))
              .append(ARROW)
              .append(v) // the encoding, not the numeric code point of k
              .append(System.lineSeparator()));
        // @formatter:on
        return sb.toString();
    }
}