/*
 * Copyright (C) 2009 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.common.escape;

import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.annotations.GwtCompatible;
import com.google.errorprone.annotations.CanIgnoreReturnValue;
import java.util.HashMap;
import java.util.Map;
import javax.annotation.CheckForNull;
import org.checkerframework.checker.nullness.qual.Nullable;

/**
 * Static utility methods pertaining to {@link Escaper} instances.
 *
 * @author Sven Mawson
 * @author David Beaumont
 * @since 15.0
 */
@GwtCompatible
@ElementTypesAreNonnullByDefault
public final class Escapers {
  private Escapers() {}

  /**
   * Returns an {@link Escaper} that does no escaping, passing all character data through unchanged.
   */
  public static Escaper nullEscaper() {
    return NULL_ESCAPER;
  }

  // An Escaper that efficiently performs no escaping.
  // Extending CharEscaper (instead of Escaper) makes Escapers.compose() easier.
  private static final Escaper NULL_ESCAPER =
      new CharEscaper() {
        @Override
        public String escape(String string) {
          // The input is returned as-is; only null is rejected.
          return checkNotNull(string);
        }

        @Override
        @CheckForNull
        protected char[] escape(char c) {
          // TODO: Fix tests not to call this directly and make it throw an error.
          return null;
        }
      };

  /**
   * Returns a builder for creating simple, fast escapers. A builder instance can be reused and each
   * escaper that is created will be a snapshot of the current builder state. Builders are not
   * thread safe.
   *
   * <p>The initial state of the builder is such that:
   *
   * <ul>
   *   <li>There are no replacement mappings
   *   <li>{@code safeMin == Character.MIN_VALUE}
   *   <li>{@code safeMax == Character.MAX_VALUE}
   *   <li>{@code unsafeReplacement == null}
   * </ul>
   *
   * <p>For performance reasons escapers created by this builder are not Unicode aware and will not
   * validate the well-formedness of their input.
   */
  public static Builder builder() {
    return new Builder();
  }

  /**
   * A builder for simple, fast escapers.
   *
   * <p>Typically an escaper needs to deal with the escaping of high valued characters or code
   * points. In these cases it is necessary to extend either {@link ArrayBasedCharEscaper} or {@link
   * ArrayBasedUnicodeEscaper} to provide the desired behavior. However this builder is suitable for
   * creating escapers that replace a relatively small set of characters.
   *
   * @author David Beaumont
   * @since 15.0
   */
  public static final class Builder {
    private final Map<Character, String> replacementMap = new HashMap<>();
    private char safeMin = Character.MIN_VALUE;
    private char safeMax = Character.MAX_VALUE;
    @CheckForNull private String unsafeReplacement = null;

    // The constructor is exposed via the builder() method above.
    private Builder() {}

    /**
     * Sets the safe range of characters for the escaper. Characters in this range that have no
     * explicit replacement are considered 'safe' and remain unescaped in the output. If {@code
     * safeMax < safeMin} then the safe range is empty.
     *
     * @param safeMin the lowest 'safe' character
     * @param safeMax the highest 'safe' character
     * @return the builder instance
     */
    @CanIgnoreReturnValue
    public Builder setSafeRange(char safeMin, char safeMax) {
      this.safeMin = safeMin;
      this.safeMax = safeMax;
      return this;
    }

    /**
     * Sets the replacement string for any characters outside the 'safe' range that have no explicit
     * replacement. If {@code unsafeReplacement} is {@code null} then no replacement will occur, if
     * it is {@code ""} then the unsafe characters are removed from the output.
     *
     * @param unsafeReplacement the string to replace unsafe characters
     * @return the builder instance
     */
    @CanIgnoreReturnValue
    public Builder setUnsafeReplacement(@Nullable String unsafeReplacement) {
      this.unsafeReplacement = unsafeReplacement;
      return this;
    }

    /**
     * Adds a replacement string for the given input character. The specified character will be
     * replaced by the given string whenever it occurs in the input, irrespective of whether it lies
     * inside or outside the 'safe' range.
     *
     * @param c the character to be replaced
     * @param replacement the string to replace the given character
     * @return the builder instance
     * @throws NullPointerException if {@code replacement} is null
     */
    @CanIgnoreReturnValue
    public Builder addEscape(char c, String replacement) {
      checkNotNull(replacement);
      // This can replace an existing character (the builder is re-usable).
      replacementMap.put(c, replacement);
      return this;
    }

    /** Returns a new escaper based on the current state of the builder. */
    public Escaper build() {
      return new ArrayBasedCharEscaper(replacementMap, safeMin, safeMax) {
        // Converted once, when the escaper is created, so later calls to setUnsafeReplacement()
        // on the builder do not change what this escaper returns.
        @CheckForNull
        private final char[] replacementChars =
            unsafeReplacement != null ? unsafeReplacement.toCharArray() : null;

        @Override
        @CheckForNull
        protected char[] escapeUnsafe(char c) {
          // The same array (or null) is returned for every character passed to this method.
          return replacementChars;
        }
      };
    }
  }

  /**
   * Returns a {@link UnicodeEscaper} equivalent to the given escaper instance. If the escaper is
   * already a UnicodeEscaper then it is simply returned, otherwise it is wrapped in a
   * UnicodeEscaper.
   *
   * <p>When a {@link CharEscaper} escaper is wrapped by this method it acquires extra behavior with
   * respect to the well-formedness of Unicode character sequences and will throw {@link
   * IllegalArgumentException} when given bad input.
   *
   * @param escaper the instance to be wrapped
   * @return a UnicodeEscaper with the same behavior as the given instance
   * @throws NullPointerException if escaper is null
   * @throws IllegalArgumentException if escaper is not a UnicodeEscaper or a CharEscaper
   */
  static UnicodeEscaper asUnicodeEscaper(Escaper escaper) {
    checkNotNull(escaper);
    if (escaper instanceof UnicodeEscaper) {
      return (UnicodeEscaper) escaper;
    } else if (escaper instanceof CharEscaper) {
      return wrap((CharEscaper) escaper);
    }
    // In practice this shouldn't happen because it would be very odd not to
    // extend either CharEscaper or UnicodeEscaper for non trivial cases.
    throw new IllegalArgumentException(
        "Cannot create a UnicodeEscaper from: " + escaper.getClass().getName());
  }

  /**
   * Returns a string that would replace the given character in the specified escaper, or {@code
   * null} if no replacement should be made. This method is intended for use in tests through the
   * {@code EscaperAsserts} class; production users of {@link CharEscaper} should limit themselves
   * to its public interface.
   *
   * @param escaper the escaper whose per-character {@code escape(char)} method is consulted
   * @param c the character to escape if necessary
   * @return the replacement string, or {@code null} if no escaping was needed
   */
  @CheckForNull
  public static String computeReplacement(CharEscaper escaper, char c) {
    return stringOrNull(escaper.escape(c));
  }

  /**
   * Returns a string that would replace the given character in the specified escaper, or {@code
   * null} if no replacement should be made. This method is intended for use in tests through the
   * {@code EscaperAsserts} class; production users of {@link UnicodeEscaper} should limit
   * themselves to its public interface.
   *
   * @param escaper the escaper whose per-code-point {@code escape(int)} method is consulted
   * @param cp the Unicode code point to escape if necessary
   * @return the replacement string, or {@code null} if no escaping was needed
   */
  @CheckForNull
  public static String computeReplacement(UnicodeEscaper escaper, int cp) {
    return stringOrNull(escaper.escape(cp));
  }

  /** Converts a possibly-null char array into a new String, passing {@code null} through. */
  @CheckForNull
  private static String stringOrNull(@CheckForNull char[] in) {
    return (in == null) ? null : new String(in);
  }

  /** Private helper to wrap a CharEscaper as a UnicodeEscaper. */
  private static UnicodeEscaper wrap(CharEscaper escaper) {
    return new UnicodeEscaper() {
      @Override
      @CheckForNull
      protected char[] escape(int cp) {
        // If a code point maps to a single character, just escape that.
        if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
          return escaper.escape((char) cp);
        }
        // Convert the code point to a surrogate pair and escape them both.
        // Note: This code path is horribly slow and typically allocates 4 new
        // char[] each time it is invoked. However this avoids any
        // synchronization issues and makes the escaper thread safe.
        char[] surrogateChars = new char[2];
        Character.toChars(cp, surrogateChars, 0);
        char[] hiChars = escaper.escape(surrogateChars[0]);
        char[] loChars = escaper.escape(surrogateChars[1]);

        // If either hiChars or loChars are non-null, the CharEscaper is trying
        // to escape the characters of a surrogate pair separately. This is
        // uncommon and applies only to escapers that assume UCS-2 rather than
        // UTF-16. See: http://en.wikipedia.org/wiki/UTF-16/UCS-2
        if (hiChars == null && loChars == null) {
          // We expect this to be the common code path for most escapers.
          return null;
        }
        // Combine the characters and/or escaped sequences into a single array.
        // A half with no replacement contributes its original surrogate (one char).
        int hiCount = hiChars != null ? hiChars.length : 1;
        int loCount = loChars != null ? loChars.length : 1;
        char[] output = new char[hiCount + loCount];
        if (hiChars != null) {
          System.arraycopy(hiChars, 0, output, 0, hiChars.length);
        } else {
          output[0] = surrogateChars[0];
        }
        // The low half always starts right after the hiCount chars of the high half.
        if (loChars != null) {
          System.arraycopy(loChars, 0, output, hiCount, loChars.length);
        } else {
          output[hiCount] = surrogateChars[1];
        }
        return output;
      }
    };
  }
}