001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.jexl2.parser;
018
/**
 * Common constant strings utilities.
 * <p>
 * The methods of this class read JEXL string literals and handle escaping through the
 * 'backslash' character. Escaping is used to neutralize string delimiters (the single
 * and double quotes) and to read Unicode hexadecimal encoded characters.
 * </p>
 * <p>
 * The only escapable characters are the single and double quotes,
 * a Unicode sequence starting with 'u' followed by 4 hexadecimal digits (0-9, a-f, A-F) and
 * the backslash character itself.
 * </p>
 * <p>
 * A sequence where a backslash occurs before any non-escapable character or sequence has no
 * effect, the sequence output being the same as the input.
 * </p>
 */
public class StringParser {
    /** The length of an escaped unicode sequence (number of hexadecimal digits). */
    private static final int UCHAR_LEN = 4;
    /** The number of bits encoded by one hexadecimal digit. */
    private static final int NIBBLE_BITS = 4;
    /** Initial shift value for composing a Unicode char from 4 nibbles (16 - 4). */
    private static final int SHIFT = 12;
    /** The offset added to a hexadecimal letter (a-f) to obtain its numeric value. */
    private static final int BASE10 = 10;
    /** The last 7bits ascii character. */
    private static final char LAST_ASCII = 127;
    /** The first printable 7bits ascii character. */
    private static final char FIRST_ASCII = 32;

    /** Default constructor.  */
    public StringParser() {
    }

    /**
     * Builds a string, handles escaping through the backslash syntax.
     * <p>
     * When <code>eatsep</code> is true, the first character of <code>str</code> is taken as the
     * separator and both the first and the last characters are excluded from the range that is
     * read; <code>str</code> must then contain at least one character.
     * </p>
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildString(CharSequence str, boolean eatsep) {
        StringBuilder strb = new StringBuilder(str.length());
        char sep = eatsep ? str.charAt(0) : 0;
        int end = str.length() - (eatsep ? 1 : 0);
        int begin = (eatsep ? 1 : 0);
        read(strb, str, begin, end, sep);
        return strb.toString();
    }

    /**
     * Read the remainder of a string till a given separator,
     * handles escaping through the backslash syntax.
     * <p>
     * An unescaped separator stops the reading; it is copied into the buffer before stopping.
     * </p>
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param index the offset into the origin
     * @param sep the separator, single or double quote, marking end of string
     * @return the offset of the unescaped separator in origin, or the origin length
     * if no unescaped separator was found
     */
    public static int readString(StringBuilder strb, CharSequence str, int index, char sep) {
        return read(strb, str, index, str.length(), sep);
    }

    /**
     * Read the remainder of a string till a given separator,
     * handles escaping through the backslash syntax.
     * <p>
     * A backslash that is the very last character of the range is not copied to the buffer.
     * </p>
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param begin the relative offset in str to begin reading
     * @param end the relative offset in str to end reading
     * @param sep the separator, single or double quote, marking end of string;
     * 0 means no separator, in which case both quote characters are escapable
     * @return the offset of the unescaped separator that stopped the reading,
     * or <code>end</code> if the whole range was read
     */
    private static int read(StringBuilder strb, CharSequence str, int begin, int end, char sep) {
        boolean escape = false;
        int index = begin;
        for (; index < end; ++index) {
            char c = str.charAt(index);
            if (escape) {
                // the 4 hexadecimal digits must all lie before 'end'
                if (c == 'u' && (index + UCHAR_LEN) < end && readUnicodeChar(strb, str, index + 1) > 0) {
                    index += UCHAR_LEN;
                } else {
                    // if c is not an escapable character, re-emit the backslash before it
                    boolean notSeparator = sep == 0 ? c != '\'' && c != '"' : c != sep;
                    if (notSeparator && c != '\\') {
                        strb.append('\\');
                    }
                    strb.append(c);
                }
                escape = false;
                continue;
            }
            if (c == '\\') {
                escape = true;
                continue;
            }
            strb.append(c);
            if (c == sep) {
                break;
            }
        }
        return index;
    }

    /**
     * Reads a Unicode escape character.
     * <p>
     * Nothing is written to the builder unless all 4 characters are valid hexadecimal digits.
     * </p>
     * @param strb the builder to write the character to
     * @param str the sequence
     * @param begin the begin offset in sequence (after the backslash and the 'u')
     * @return 0 if char could not be read, 4 otherwise
     */
    private static int readUnicodeChar(StringBuilder strb, CharSequence str, int begin) {
        char xc = 0;
        int bits = SHIFT;
        for (int offset = 0; offset < UCHAR_LEN; ++offset) {
            char c = str.charAt(begin + offset);
            int value;
            if (c >= '0' && c <= '9') {
                value = (c - '0');
            } else if (c >= 'a' && c <= 'f') {
                value = (c - 'a' + BASE10);
            } else if (c >= 'A' && c <= 'F') {
                value = (c - 'A' + BASE10);
            } else {
                // not a hexadecimal digit: the sequence is not a Unicode escape
                return 0;
            }
            // most significant nibble first
            xc |= value << bits;
            bits -= NIBBLE_BITS;
        }
        strb.append(xc);
        return UCHAR_LEN;
    }

    /**
     * Escapes a String representation, expand non-ASCII characters as Unicode escape sequence.
     * <p>
     * The result is enclosed between two occurrences of the delimiter. Backspace, tab, newline,
     * form-feed, carriage-return, both quote characters and the backslash are written as
     * backslash sequences; NUL characters (code 0) are dropped; any other character outside
     * the range 32 to 127 is written as a backslash, 'u' and 4 hexadecimal digits.
     * </p>
     * @param str the string to escape
     * @param delim the delimiter character written before and after the escaped content
     * @return the escaped representation, or null if <code>str</code> is null
     */
    public static String escapeString(String str, char delim) {
        if (str == null) {
            return null;
        }
        final int length = str.length();
        StringBuilder strb = new StringBuilder(length + 2);
        strb.append(delim);
        for (int i = 0; i < length; ++i) {
            char c = str.charAt(i);
            switch (c) {
                case 0:
                    // NUL characters are skipped
                    continue;
                case '\b':
                    strb.append("\\b");
                    break;
                case '\t':
                    strb.append("\\t");
                    break;
                case '\n':
                    strb.append("\\n");
                    break;
                case '\f':
                    strb.append("\\f");
                    break;
                case '\r':
                    strb.append("\\r");
                    break;
                case '\"':
                    strb.append("\\\"");
                    break;
                case '\'':
                    strb.append("\\\'");
                    break;
                case '\\':
                    strb.append("\\\\");
                    break;
                default:
                    if (c >= FIRST_ASCII && c <= LAST_ASCII) {
                        strb.append(c);
                    } else {
                        // convert to Unicode escape sequence, left-padded with zeroes to 4 digits
                        strb.append('\\');
                        strb.append('u');
                        String hex = Integer.toHexString(c);
                        for (int h = hex.length(); h < UCHAR_LEN; ++h) {
                            strb.append('0');
                        }
                        strb.append(hex);
                    }
            }
        }
        strb.append(delim);
        return strb.toString();
    }
}