001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text; 018 019import java.io.UnsupportedEncodingException; 020import java.util.Arrays; 021import java.util.Collection; 022import java.util.Collections; 023import java.util.HashMap; 024import java.util.Iterator; 025import java.util.LinkedHashMap; 026import java.util.LinkedHashSet; 027import java.util.Map; 028import java.util.Map.Entry; 029import java.util.Objects; 030import java.util.Set; 031 032import org.apache.commons.lang3.ArrayUtils; 033import org.apache.commons.lang3.StringUtils; 034 035/** 036 * <p> 037 * Convert from one alphabet to another, with the possibility of leaving certain 038 * characters unencoded. 039 * </p> 040 * 041 * <p> 042 * The target and do not encode languages must be in the Unicode BMP, but the 043 * source language does not. 044 * </p> 045 * 046 * <p> 047 * The encoding will all be of a fixed length, except for the 'do not encode' 048 * chars, which will be of length 1 049 * </p> 050 * 051 * <h2>Sample usage</h2> 052 * 053 * <pre> 054 * Character[] originals; // a, b, c, d 055 * Character[] encoding; // 0, 1, d 056 * Character[] doNotEncode; // d 057 * 058 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals, 059 * encoding, doNotEncode); 060 * 061 * ac.encode("a"); // 00 062 * ac.encode("b"); // 01 063 * ac.encode("c"); // 0d 064 * ac.encode("d"); // d 065 * ac.encode("abcd"); // 00010dd 066 * </pre> 067 * 068 * <p> 069 * #ThreadSafe# AlphabetConverter class methods are thread-safe as they do not 070 * change internal state. 071 * </p> 072 * 073 * @since 1.0 074 * 075 */ 076public final class AlphabetConverter { 077 078 /** 079 * Arrow constant, used for converting the object into a string. 080 */ 081 private static final String ARROW = " -> "; 082 083 /** 084 * Creates new String that contains just the given code point. 085 * 086 * @param i code point 087 * @return a new string with the new code point 088 * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html" 089 */ 090 private static String codePointToString(final int i) { 091 if (Character.charCount(i) == 1) { 092 return String.valueOf((char) i); 093 } 094 return new String(Character.toChars(i)); 095 } 096 097 /** 098 * Converts characters to integers. 099 * 100 * @param chars array of characters 101 * @return an equivalent array of integers 102 */ 103 private static Integer[] convertCharsToIntegers(final Character[] chars) { 104 if (ArrayUtils.isEmpty(chars)) { 105 return ArrayUtils.EMPTY_INTEGER_OBJECT_ARRAY; 106 } 107 final Integer[] integers = new Integer[chars.length]; 108 for (int i = 0; i < chars.length; i++) { 109 integers[i] = (int) chars[i]; 110 } 111 return integers; 112 } 113 114 /** 115 * Creates an alphabet converter, for converting from the original alphabet, 116 * to the encoded alphabet, while leaving 117 * the characters in <em>doNotEncode</em> as they are (if possible). 118 * 119 * <p>Duplicate letters in either original or encoding will be ignored.</p> 120 * 121 * @param original an array of ints representing the original alphabet in 122 * code points 123 * @param encoding an array of ints representing the alphabet to be used for 124 * encoding, in code points 125 * @param doNotEncode an array of ints representing the chars to be encoded 126 * using the original alphabet - every char 127 * here must appear in both the previous params 128 * @return The AlphabetConverter 129 * @throws IllegalArgumentException if an AlphabetConverter cannot be 130 * constructed 131 */ 132 public static AlphabetConverter createConverter( 133 final Integer[] original, 134 final Integer[] encoding, 135 final Integer[] doNotEncode) { 136 final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.asList(original)); 137 final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.asList(encoding)); 138 final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.asList(doNotEncode)); 139 140 final Map<Integer, String> originalToEncoded = new LinkedHashMap<>(); 141 final Map<String, String> encodedToOriginal = new LinkedHashMap<>(); 142 final Map<Integer, String> doNotEncodeMap = new HashMap<>(); 143 144 final int encodedLetterLength; 145 146 for (final int i : doNotEncodeCopy) { 147 if (!originalCopy.contains(i)) { 148 throw new IllegalArgumentException( 149 "Can not use 'do not encode' list because original " 150 + "alphabet does not contain '" 151 + codePointToString(i) + "'"); 152 } 153 154 if (!encodingCopy.contains(i)) { 155 throw new IllegalArgumentException( 156 "Can not use 'do not encode' list because encoding alphabet does not contain '" 157 + codePointToString(i) + "'"); 158 } 159 160 doNotEncodeMap.put(i, codePointToString(i)); 161 } 162 163 if (encodingCopy.size() >= originalCopy.size()) { 164 encodedLetterLength = 1; 165 166 final Iterator<Integer> it = encodingCopy.iterator(); 167 168 for (final int originalLetter : originalCopy) { 169 final String originalLetterAsString = 170 codePointToString(originalLetter); 171 172 if (doNotEncodeMap.containsKey(originalLetter)) { 173 originalToEncoded.put(originalLetter, 174 originalLetterAsString); 175 encodedToOriginal.put(originalLetterAsString, 176 originalLetterAsString); 177 } else { 178 Integer next = it.next(); 179 180 while (doNotEncodeCopy.contains(next)) { 181 next = it.next(); 182 } 183 184 final String encodedLetter = codePointToString(next); 185 186 originalToEncoded.put(originalLetter, encodedLetter); 187 encodedToOriginal.put(encodedLetter, 188 originalLetterAsString); 189 } 190 } 191 192 return new AlphabetConverter(originalToEncoded, 193 encodedToOriginal, 194 encodedLetterLength); 195 196 } 197 if (encodingCopy.size() - doNotEncodeCopy.size() < 2) { 198 throw new IllegalArgumentException( 199 "Must have at least two encoding characters (excluding " 200 + "those in the 'do not encode' list), but has " 201 + (encodingCopy.size() - doNotEncodeCopy.size())); 202 } 203 // we start with one which is our minimum, and because we do the 204 // first division outside the loop 205 int lettersSoFar = 1; 206 207 // the first division takes into account that the doNotEncode 208 // letters can't be in the leftmost place 209 int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size()) 210 / (encodingCopy.size() - doNotEncodeCopy.size()); 211 212 while (lettersLeft / encodingCopy.size() >= 1) { 213 lettersLeft = lettersLeft / encodingCopy.size(); 214 lettersSoFar++; 215 } 216 217 encodedLetterLength = lettersSoFar + 1; 218 219 final AlphabetConverter ac = 220 new AlphabetConverter(originalToEncoded, 221 encodedToOriginal, 222 encodedLetterLength); 223 224 ac.addSingleEncoding(encodedLetterLength, 225 StringUtils.EMPTY, 226 encodingCopy, 227 originalCopy.iterator(), 228 doNotEncodeMap); 229 230 return ac; 231 } 232 233 /** 234 * Creates an alphabet converter, for converting from the original alphabet, 235 * to the encoded alphabet, while leaving the characters in 236 * <em>doNotEncode</em> as they are (if possible). 237 * 238 * <p>Duplicate letters in either original or encoding will be ignored.</p> 239 * 240 * @param original an array of chars representing the original alphabet 241 * @param encoding an array of chars representing the alphabet to be used 242 * for encoding 243 * @param doNotEncode an array of chars to be encoded using the original 244 * alphabet - every char here must appear in 245 * both the previous params 246 * @return The AlphabetConverter 247 * @throws IllegalArgumentException if an AlphabetConverter cannot be 248 * constructed 249 */ 250 public static AlphabetConverter createConverterFromChars( 251 final Character[] original, 252 final Character[] encoding, 253 final Character[] doNotEncode) { 254 return AlphabetConverter.createConverter( 255 convertCharsToIntegers(original), 256 convertCharsToIntegers(encoding), 257 convertCharsToIntegers(doNotEncode)); 258 } 259 260 /** 261 * Creates a new converter from a map. 262 * 263 * @param originalToEncoded a map returned from getOriginalToEncoded() 264 * @return The reconstructed AlphabetConverter 265 * @see AlphabetConverter#getOriginalToEncoded() 266 */ 267 public static AlphabetConverter createConverterFromMap(final Map<Integer, String> originalToEncoded) { 268 final Map<Integer, String> unmodifiableOriginalToEncoded = Collections.unmodifiableMap(originalToEncoded); 269 final Map<String, String> encodedToOriginal = new LinkedHashMap<>(); 270 271 int encodedLetterLength = 1; 272 273 for (final Entry<Integer, String> e : unmodifiableOriginalToEncoded.entrySet()) { 274 final String originalAsString = codePointToString(e.getKey()); 275 encodedToOriginal.put(e.getValue(), originalAsString); 276 277 if (e.getValue().length() > encodedLetterLength) { 278 encodedLetterLength = e.getValue().length(); 279 } 280 } 281 282 return new AlphabetConverter(unmodifiableOriginalToEncoded, encodedToOriginal, encodedLetterLength); 283 } 284 285 /** 286 * Original string to be encoded. 287 */ 288 private final Map<Integer, String> originalToEncoded; 289 290 /** 291 * Encoding alphabet. 292 */ 293 private final Map<String, String> encodedToOriginal; 294 295 /** 296 * Length of the encoded letter. 297 */ 298 private final int encodedLetterLength; 299 300 /** 301 * Hidden constructor for alphabet converter. Used by static helper methods. 302 * 303 * @param originalToEncoded original string to be encoded 304 * @param encodedToOriginal encoding alphabet 305 * @param encodedLetterLength length of the encoded letter 306 */ 307 private AlphabetConverter(final Map<Integer, String> originalToEncoded, 308 final Map<String, String> encodedToOriginal, 309 final int encodedLetterLength) { 310 311 this.originalToEncoded = originalToEncoded; 312 this.encodedToOriginal = encodedToOriginal; 313 this.encodedLetterLength = encodedLetterLength; 314 } 315 316 /** 317 * Recursive method used when creating encoder/decoder. 318 * 319 * @param level at which point it should add a single encoding 320 * @param currentEncoding current encoding 321 * @param encoding letters encoding 322 * @param originals original values 323 * @param doNotEncodeMap map of values that should not be encoded 324 */ 325 private void addSingleEncoding(final int level, 326 final String currentEncoding, 327 final Collection<Integer> encoding, 328 final Iterator<Integer> originals, 329 final Map<Integer, String> doNotEncodeMap) { 330 331 if (level > 0) { 332 for (final int encodingLetter : encoding) { 333 if (!originals.hasNext()) { 334 return; // done encoding all the original alphabet 335 } 336 // this skips the doNotEncode chars if they are in the 337 // leftmost place 338 if (level != encodedLetterLength 339 || !doNotEncodeMap.containsKey(encodingLetter)) { 340 addSingleEncoding(level - 1, 341 currentEncoding 342 + codePointToString(encodingLetter), 343 encoding, 344 originals, 345 doNotEncodeMap 346 ); 347 } 348 } 349 } else { 350 Integer next = originals.next(); 351 352 while (doNotEncodeMap.containsKey(next)) { 353 final String originalLetterAsString = codePointToString(next); 354 355 originalToEncoded.put(next, originalLetterAsString); 356 encodedToOriginal.put(originalLetterAsString, 357 originalLetterAsString); 358 359 if (!originals.hasNext()) { 360 return; 361 } 362 363 next = originals.next(); 364 } 365 366 final String originalLetterAsString = codePointToString(next); 367 368 originalToEncoded.put(next, currentEncoding); 369 encodedToOriginal.put(currentEncoding, originalLetterAsString); 370 } 371 } 372 373 /** 374 * Decodes a given string. 375 * 376 * @param encoded a string that has been encoded using this 377 * AlphabetConverter 378 * @return The decoded string, {@code null} if the given string is null 379 * @throws UnsupportedEncodingException if unexpected characters that 380 * cannot be handled are encountered 381 */ 382 public String decode(final String encoded) 383 throws UnsupportedEncodingException { 384 if (encoded == null) { 385 return null; 386 } 387 388 final StringBuilder result = new StringBuilder(); 389 390 for (int j = 0; j < encoded.length();) { 391 final int i = encoded.codePointAt(j); 392 final String s = codePointToString(i); 393 394 if (s.equals(originalToEncoded.get(i))) { 395 result.append(s); 396 j++; // because we do not encode in Unicode extended the 397 // length of each encoded char is 1 398 } else { 399 if (j + encodedLetterLength > encoded.length()) { 400 throw new UnsupportedEncodingException("Unexpected end " 401 + "of string while decoding " + encoded); 402 } 403 final String nextGroup = encoded.substring(j, 404 j + encodedLetterLength); 405 final String next = encodedToOriginal.get(nextGroup); 406 if (next == null) { 407 throw new UnsupportedEncodingException( 408 "Unexpected string without decoding (" 409 + nextGroup + ") in " + encoded); 410 } 411 result.append(next); 412 j += encodedLetterLength; 413 } 414 } 415 416 return result.toString(); 417 } 418 419 /** 420 * Encodes a given string. 421 * 422 * @param original the string to be encoded 423 * @return The encoded string, {@code null} if the given string is null 424 * @throws UnsupportedEncodingException if chars that are not supported are 425 * encountered 426 */ 427 public String encode(final String original) 428 throws UnsupportedEncodingException { 429 if (original == null) { 430 return null; 431 } 432 433 final StringBuilder sb = new StringBuilder(); 434 435 for (int i = 0; i < original.length();) { 436 final int codePoint = original.codePointAt(i); 437 438 final String nextLetter = originalToEncoded.get(codePoint); 439 440 if (nextLetter == null) { 441 throw new UnsupportedEncodingException( 442 "Couldn't find encoding for '" 443 + codePointToString(codePoint) 444 + "' in " 445 + original 446 ); 447 } 448 449 sb.append(nextLetter); 450 451 i += Character.charCount(codePoint); 452 } 453 454 return sb.toString(); 455 } 456 457 @Override 458 public boolean equals(final Object obj) { 459 if (obj == null) { 460 return false; 461 } 462 if (obj == this) { 463 return true; 464 } 465 if (!(obj instanceof AlphabetConverter)) { 466 return false; 467 } 468 final AlphabetConverter other = (AlphabetConverter) obj; 469 return originalToEncoded.equals(other.originalToEncoded) 470 && encodedToOriginal.equals(other.encodedToOriginal) 471 && encodedLetterLength == other.encodedLetterLength; 472 } 473 474 /** 475 * Gets the length of characters in the encoded alphabet that are necessary 476 * for each character in the original 477 * alphabet. 478 * 479 * @return The length of the encoded char 480 */ 481 public int getEncodedCharLength() { 482 return encodedLetterLength; 483 } 484 485 /** 486 * Gets the mapping from integer code point of source language to encoded 487 * string. Use to reconstruct converter from 488 * serialized map. 489 * 490 * @return The original map 491 */ 492 public Map<Integer, String> getOriginalToEncoded() { 493 return Collections.unmodifiableMap(originalToEncoded); 494 } 495 496 @Override 497 public int hashCode() { 498 return Objects.hash(originalToEncoded, 499 encodedToOriginal, 500 encodedLetterLength); 501 } 502 503 @Override 504 public String toString() { 505 final StringBuilder sb = new StringBuilder(); 506 // @formatter:off 507 originalToEncoded.forEach((k, v) -> { 508 sb.append(codePointToString(k)) 509 .append(ARROW) 510 .append(k) 511 .append(System.lineSeparator()); 512 }); 513 // @formatter:on 514 return sb.toString(); 515 } 516}