001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.jexl2.parser;
018    
/**
 * Common constant strings utilities.
 * <p>
 * The methods of this class read JEXL string literals and handle escaping through the
 * 'backslash' (ie: \) character. Escaping is used to neutralize string delimiters (the single
 * and double quotes) and to read Unicode hexadecimal encoded characters.
 * </p>
 * <p>
 * The only escapable characters are the single and double quotes - ''' and '"' -,
 * a Unicode sequence starting with 'u' followed by 4 hexadecimal digits (0-9, a-f, A-F) and
 * the backslash character - '\' - itself.
 * </p>
 * <p>
 * A sequence where '\' occurs before any non-escapable character or sequence has no effect, the
 * sequence output being the same as the input.
 * </p>
 */
public class StringParser {
    /** The number of hexadecimal digits in a Unicode escape sequence. */
    private static final int UCHAR_LEN = 4;
    /** The number of bits encoded by one hexadecimal digit. */
    private static final int HEX_BITS = 4;
    /** The numeric value of the first alphabetic hexadecimal digit ('a' or 'A'). */
    private static final int BASE10 = 10;
    /** The first character code that {@link #escapeString(String)} emits as a Unicode escape. */
    private static final char FIRST_ESCAPED_CHAR = 127;

    /** Default constructor.  */
    public StringParser() {}

    /**
     * Builds a string, handles escaping through '\' syntax.
     * <p>
     * When <code>eatsep</code> is true, the first character is taken as the separator and both
     * the first and the last characters of the sequence are excluded from the result.
     * An empty sequence always yields an empty string.
     * </p>
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildString(CharSequence str, boolean eatsep) {
        final int length = str.length();
        if (length == 0) {
            // nothing to read and no separator to pick up: avoid charAt(0) on an empty sequence
            return "";
        }
        final StringBuilder strb = new StringBuilder(length);
        if (eatsep) {
            // skip the opening separator and stop before the closing one
            read(strb, str, 1, length - 1, str.charAt(0));
        } else {
            // no separator: both kinds of quotes are treated as escapable
            read(strb, str, 0, length, (char) 0);
        }
        return strb.toString();
    }

    /**
     * Read the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * <p>
     * The separator, when met unescaped, is copied into the buffer and reading stops.
     * </p>
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param index the offset into the origin
     * @param sep the separator, single or double quote, marking end of string
     * @return the offset in origin of the separator that stopped the read, or the length
     * of the origin if no unescaped separator was found
     */
    public static int readString(StringBuilder strb, CharSequence str, int index, char sep) {
        return read(strb, str, index, str.length(), sep);
    }

    /**
     * Read the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * <p>
     * Note that a lone backslash that is the very last character read is not copied
     * to the destination buffer.
     * </p>
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param begin the relative offset in str to begin reading
     * @param end the relative offset in str to end reading
     * @param sep the separator, single or double quote, marking end of string;
     * 0 means no separator, in which case both quotes are considered escapable
     * @return the last character offset handled in origin
     */
    private static int read(StringBuilder strb, CharSequence str, int begin, int end, char sep) {
        boolean escape = false;
        int index = begin;
        for (; index < end; ++index) {
            final char c = str.charAt(index);
            if (escape) {
                // the previous character was a backslash: c decides what the pair means
                if (c == 'u' && (index + UCHAR_LEN) < end && readUnicodeChar(strb, str, index + 1) > 0) {
                    // a well-formed Unicode escape was decoded, skip its hexadecimal digits
                    index += UCHAR_LEN;
                } else {
                    // if c is not an escapable character, re-emit the backslash before it
                    final boolean notSeparator = sep == 0 ? c != '\'' && c != '"' : c != sep;
                    if (notSeparator && c != '\\') {
                        strb.append('\\');
                    }
                    strb.append(c);
                }
                escape = false;
                continue;
            }
            if (c == '\\') {
                // hold the backslash back until the next character is known
                escape = true;
                continue;
            }
            strb.append(c);
            if (c == sep) {
                // unescaped separator: it has been copied, stop here
                break;
            }
        }
        return index;
    }

    /**
     * Reads a Unicode escape character.
     * <p>
     * Exactly 4 hexadecimal digits (0-9, a-f, A-F) are expected; if any of the 4 characters
     * is not such a digit, nothing is written to the builder.
     * The caller must ensure the 4 characters starting at <code>begin</code> exist.
     * </p>
     * @param strb the builder to write the character to
     * @param str the sequence
     * @param begin the begin offset in sequence (after the '\\u')
     * @return 0 if char could not be read, 4 otherwise
     */
    private static int readUnicodeChar(StringBuilder strb, CharSequence str, int begin) {
        int xc = 0;
        for (int offset = 0; offset < UCHAR_LEN; ++offset) {
            final char c = str.charAt(begin + offset);
            final int value;
            if (c >= '0' && c <= '9') {
                value = c - '0';
            } else if (c >= 'a' && c <= 'f') {
                value = c - 'a' + BASE10;
            } else if (c >= 'A' && c <= 'F') {
                value = c - 'A' + BASE10;
            } else {
                // not a hexadecimal digit: the sequence is not a valid Unicode escape
                return 0;
            }
            // digits come most significant first
            xc = (xc << HEX_BITS) | value;
        }
        strb.append((char) xc);
        return UCHAR_LEN;
    }

    /**
     * Escapes a String representation, expand non-ASCII characters as Unicode escape sequence.
     * <p>
     * The result is enclosed in single quotes; single quotes and backslashes found in the
     * input are prefixed by a backslash, characters whose code is 127 or above are written
     * as a backslash, 'u' and 4 lower-case hexadecimal digits.
     * </p>
     * @param str the string to escape
     * @return the escaped representation, null if str is null
     */
    public static String escapeString(String str) {
        if (str == null) {
            return null;
        }
        final int length = str.length();
        final StringBuilder strb = new StringBuilder(length + 2);
        strb.append('\'');
        for (int i = 0; i < length; ++i) {
            final char c = str.charAt(i);
            if (c >= FIRST_ESCAPED_CHAR) {
                // convert to Unicode escape sequence, left-padded with zeros to 4 digits
                final String hex = Integer.toHexString(c);
                strb.append('\\').append('u');
                for (int h = hex.length(); h < UCHAR_LEN; ++h) {
                    strb.append('0');
                }
                strb.append(hex);
            } else if (c == '\'' || c == '\\') {
                // escape quote and backslash
                strb.append('\\').append(c);
            } else {
                strb.append(c);
            }
        }
        strb.append('\'');
        return strb.toString();
    }
}