View Javadoc

1   /*
2    * This file is part of Domingo
3    * an Open Source Java-API to Lotus Notes/Domino
4    * hosted at http://domingo.sourceforge.net
5    *
6    * Copyright (c) 2003-2007 Beck et al. projects GmbH Munich, Germany (http://www.bea.de)
7    *
8    * This library is free software; you can redistribute it and/or
9    * modify it under the terms of the GNU Lesser General Public
10   * License as published by the Free Software Foundation; either
11   * version 2.1 of the License, or (at your option) any later version.
12   *
13   * This library is distributed in the hope that it will be useful,
14   * but WITHOUT ANY WARRANTY; without even the implied warranty of
15   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   * Lesser General Public License for more details.
17   *
18   * You should have received a copy of the GNU Lesser General Public
19   * License along with this library; if not, write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21   */
22  package de.bea.domingo.util;
23  
24  import java.io.UnsupportedEncodingException;
25  import java.util.Properties;
26  
/**
 * Converts the raw bytes of an XML instance document into a string, using the
 * encoding named in the XML declaration where possible.
 *
 * <p>The encoding is resolved in this order: the <code>encoding</code>
 * pseudo-attribute of the XML declaration, the encoding auto-detected from
 * the byte order mark or first bytes, UTF-8, and finally the platform
 * default encoding.</p>
 *
 * @author <a href="mailto:kriede@users.sourceforge.net">Kurt Riede</a>
 */
public final class XMLUtil {

    /** Sample XML declaration; twice its length bounds the bytes that are pre-parsed. */
    private static final String XML_HEADER_SAMPLE = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"      ";

    /** Start of an XML declaration. */
    private static final String XML_START = "<?xml";

    /** End of an XML declaration. */
    private static final String XML_END = "?>";

    /** Number of leading bytes inspected for encoding auto-detection. */
    private static final int DETECTION_BYTES = 4;

    private XMLUtil() {
    }

    /**
     * Parses a byte array that represents an XML instance document to a String
     * with respect to encoding.
     *
     * <p>Input that is empty, shorter than four bytes, or that has no (or a
     * malformed) XML declaration is decoded with the auto-detected encoding,
     * which defaults to UTF-8.</p>
     *
     * @param bytes the bytes to parse
     * @return parsed string
     * @throws NullPointerException if <code>bytes</code> is <code>null</code>
     */
    public static String parse(final byte[] bytes) {
        if (bytes == null) {
            throw new NullPointerException("bytes must not be null");
        }
        // Copy at most DETECTION_BYTES bytes; shorter input leaves the rest zeroed and
        // is reported to the detector through count.
        final byte[] b4 = new byte[DETECTION_BYTES];
        final int count = Math.min(bytes.length, b4.length);
        System.arraycopy(bytes, 0, b4, 0, count);
        final String detectedEncoding = (String) IANAEncoding.getEncodingName(b4, count)[0];

        // Only the beginning of the document is needed to read the XML declaration.
        final int headerLength = Math.min(bytes.length, 2 * XML_HEADER_SAMPLE.length());
        String declaredEncoding = null;
        try {
            final String header = decodeHeader(bytes, headerLength, detectedEncoding);
            declaredEncoding = new XMLHeaderParser(header).getEncoding();
        } catch (UnsupportedEncodingException e) {
            // No usable XML declaration: fall through to the detected encoding.
            declaredEncoding = null;
        }

        if (declaredEncoding != null) {
            try {
                return new String(bytes, declaredEncoding);
            } catch (UnsupportedEncodingException e) {
                // Declared encoding is unknown to this JVM: try the detected one next.
            }
        }
        try {
            return new String(bytes, detectedEncoding);
        } catch (UnsupportedEncodingException e) {
            // Detected encoding (e.g. UCS-4) is unknown to this JVM: try UTF-8 next.
        }
        try {
            return new String(bytes, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            return new String(bytes);
        }
    }

    /**
     * Decodes the leading bytes of the document for inspection of the XML declaration.
     *
     * @param bytes the document bytes
     * @param length number of leading bytes to decode
     * @param encoding preferred encoding name
     * @return the decoded prefix; decoded as UTF-8 or with the platform default
     *         if the preferred encoding is not supported
     */
    private static String decodeHeader(final byte[] bytes, final int length, final String encoding) {
        try {
            return new String(bytes, 0, length, encoding);
        } catch (UnsupportedEncodingException e) {
            try {
                return new String(bytes, 0, length, "UTF-8");
            } catch (UnsupportedEncodingException e1) {
                return new String(bytes, 0, length);
            }
        }
    }

    /**
     * Minimal parser for the pseudo-attributes of an XML declaration.
     */
    private static final class XMLHeaderParser {

        /** U+FEFF: the byte order mark as it appears when decoded with the matching byte order. */
        private static final int BOM = 0xfeff;

        /** U+FFFE: the byte order mark as it appears when decoded with the opposite byte order. */
        private static final int BOM_SWAPPED = 0xfffe;

        /** Name of the pseudo-attribute that carries the encoding. */
        private static final String ENCODING_ATTRIBUTE = "encoding";

        /** Pseudo-attributes read so far, keyed by name. */
        private final Properties fPseudoAttributes = new Properties();

        /** Position in preparsed string. */
        private int fPosition = 0;

        /** Decoded beginning of the document. */
        private final String fPreparsedString;

        /**
         * Constructor.
         *
         * @param preparsedString decoded beginning of the document
         */
        XMLHeaderParser(final String preparsedString) {
            fPreparsedString = preparsedString;
        }

        /**
         * Reads the XML declaration up to and including the encoding pseudo-attribute.
         *
         * @return the declared encoding, or <code>null</code> if the declaration
         *         ends without an encoding pseudo-attribute
         * @throws UnsupportedEncodingException if there is no XML declaration or it is malformed
         */
        private String getEncoding() throws UnsupportedEncodingException {
            skipBOM();
            skip(XML_START);
            // Stop as soon as the encoding is known: the remainder of the declaration
            // may lie beyond the pre-parsed prefix.
            while (parsePseudoAttribute()) {
                final String encoding = fPseudoAttributes.getProperty(ENCODING_ATTRIBUTE);
                if (encoding != null) {
                    return encoding;
                }
            }
            return null;
        }

        /**
         * Skips the optional byte order mark (BOM) at the start of the string.
         */
        private void skipBOM() {
            if (fPreparsedString.length() == 0) {
                return;
            }
            final char c = fPreparsedString.charAt(0);
            if (c == BOM || c == BOM_SWAPPED) {
                fPosition++;
            }
        }

        /**
         * Parses one pseudo attribute as used in the XML declaration and stores it.
         * Whitespace around the equals sign and both quote styles are accepted.
         *
         * @return <code>true</code> if an attribute was read, <code>false</code>
         *         if the end of the declaration was reached instead
         * @throws UnsupportedEncodingException if the XML header is invalid or truncated
         */
        private boolean parsePseudoAttribute() throws UnsupportedEncodingException {
            skipWhiteSpace();
            if (fPreparsedString.startsWith(XML_END, fPosition)) {
                return false;
            }
            final int equalsIndex = fPreparsedString.indexOf('=', fPosition);
            if (equalsIndex < 0) {
                throw new UnsupportedEncodingException("expected = after character " + fPosition);
            }
            final String name = fPreparsedString.substring(fPosition, equalsIndex).trim();
            if (!isName(name)) {
                throw new UnsupportedEncodingException("expected attribute name at character " + fPosition);
            }
            fPosition = equalsIndex + 1;
            skipWhiteSpace();
            final char quote = skipQuote();
            final int closingIndex = fPreparsedString.indexOf(quote, fPosition);
            if (closingIndex < 0) {
                throw new UnsupportedEncodingException("expected closing quote after character " + fPosition);
            }
            final String value = fPreparsedString.substring(fPosition, closingIndex);
            fPosition = closingIndex + 1;
            fPseudoAttributes.setProperty(name, value);
            return true;
        }

        /**
         * Checks that a pseudo-attribute name is non-empty and consists of letters only,
         * so that text beyond a malformed declaration is never taken for an attribute.
         *
         * @param name the candidate name
         * @return <code>true</code> if the name is acceptable, else <code>false</code>
         */
        private boolean isName(final String name) {
            if (name.length() == 0) {
                return false;
            }
            for (int i = 0; i < name.length(); i++) {
                if (!Character.isLetter(name.charAt(i))) {
                    return false;
                }
            }
            return true;
        }

        /**
         * Consumes the opening quote of a pseudo-attribute value.
         *
         * @return the quote character found (double or single quote)
         * @throws UnsupportedEncodingException if there is no quote at the current position
         */
        private char skipQuote() throws UnsupportedEncodingException {
            if (fPosition < fPreparsedString.length()) {
                final char c = fPreparsedString.charAt(fPosition);
                if (c == '\"' || c == '\'') {
                    fPosition++;
                    return c;
                }
            }
            throw new UnsupportedEncodingException("expected quote at character " + fPosition);
        }

        /**
         * Advances past any white-space, stopping at the end of the string.
         */
        private void skipWhiteSpace() {
            while (fPosition < fPreparsedString.length() && isWhitespace(fPreparsedString.charAt(fPosition))) {
                fPosition++;
            }
        }

        /**
         * Checks if a given character is a white-space character in the sense of XML
         * (space, tab, carriage return or line feed).
         *
         * @param c the character to check
         * @return <code>true</code> if the character is a white-space, else <code>false</code>
         */
        private boolean isWhitespace(final char c) {
            return c == ' ' || c == '\t' || c == '\r' || c == '\n';
        }

        /**
         * Consumes a constant string at the current position.
         *
         * @param s the string to consume
         * @throws UnsupportedEncodingException if the expected string wasn't found
         */
        private void skip(final String s) throws UnsupportedEncodingException {
            if (!fPreparsedString.startsWith(s, fPosition)) {
                throw new UnsupportedEncodingException("expected " + s + " at character " + fPosition);
            }
            fPosition += s.length();
        }
    }

    /**
     * Names of the encodings that can be auto-detected.
     */
    private static final class Encoding {

        /** Encoding name: UTF-8. */
        private static final String UTF_8 = "UTF-8";

        /** Encoding name: UTF-16 with big endian. */
        private static final String UTF_16_BIG_ENDIAN = "UTF-16BE";

        /** Encoding name: UTF-16 with little endian. */
        private static final String UTF_16_LITTLE_ENDIAN = "UTF-16LE";

        /** Encoding name: ISO-10646-UCS-4. */
        private static final String ISO_10646_UCS_4 = "ISO-10646-UCS-4";

        /** Encoding name: CP037 (EBCDIC). */
        private static final String CP037 = "CP037";

        private Encoding() {
        }
    }

    /**
     * EBCDIC codes of the characters that start an XML declaration.
     */
    private static final class EBCDIC {

        /** EBCDIC character less-than ("<"). */
        private static final int LESS_THAN = 0x4C;

        /** EBCDIC character questionmark ("?"). */
        private static final int QUESTIONMARK = 0x6F;

        /** EBCDIC lower case character x. */
        private static final int LOWER_X = 0xA7;

        /** EBCDIC lower case character m. */
        private static final int LOWER_M = 0x94;

        private EBCDIC() {
        }
    }

    /**
     * ASCII codes of the characters that start an XML declaration.
     */
    private static final class ASCII {

        /** ASCII character less-than ("<"). */
        private static final int LESS_THAN = 0x3C;

        /** ASCII character questionmark ("?"). */
        private static final int QUESTIONMARK = 0x3F;

        private ASCII() {
        }
    }

    /**
     * Auto-detection of the encoding family from the first bytes of a document.
     */
    private static final class IANAEncoding {

        /** Byte value <tt>10111011</tt>: second byte of the UTF-8 BOM. */
        private static final int MASK_10111011 = 0xBB;

        /** Byte value <tt>10111111</tt>: third byte of the UTF-8 BOM. */
        private static final int MASK_10111111 = 0xBF;

        /** Bit mask <tt>11111111</tt>; also one byte of the UTF-16 BOM. */
        private static final int MASK_11111111 = 0xFF;

        /** Byte value <tt>11111110</tt>: the other byte of the UTF-16 BOM. */
        private static final int MASK_11111110 = 0xFE;

        /** Byte value <tt>11101111</tt>: first byte of the UTF-8 BOM. */
        private static final int MASK_11101111 = 0xEF;

        private IANAEncoding() {
        }

        /**
         * Returns the IANA encoding name that is auto-detected from the bytes
         * specified, with the endian-ness of that encoding where appropriate.
         *
         * @param b4 The first four bytes of the input.
         * @param count The number of bytes actually read.
         * @return a 2-element array: the first element, an IANA-encoding string,
         *         the second element a Boolean which is true iff the document is
         *         big endian, false if it's little-endian, and null if the
         *         distinction isn't relevant.
         */
        private static Object[] getEncodingName(final byte[] b4, final int count) {
            if (count < 2) {
                return new Object[] { Encoding.UTF_8, null };
            }
            // UTF-16, with BOM
            final int b0 = b4[0] & MASK_11111111;
            final int b1 = b4[1] & MASK_11111111;
            if (b0 == MASK_11111110 && b1 == MASK_11111111) {
                return new Object[] { Encoding.UTF_16_BIG_ENDIAN, Boolean.TRUE };
            }
            if (b0 == MASK_11111111 && b1 == MASK_11111110) {
                return new Object[] { Encoding.UTF_16_LITTLE_ENDIAN, Boolean.FALSE };
            }
            // default to UTF-8 if we don't have enough bytes to make a good determination of the encoding
            if (count < 3) {
                return new Object[] { Encoding.UTF_8, null };
            }
            // UTF-8 with a BOM
            final int b2 = b4[2] & MASK_11111111;
            if (b0 == MASK_11101111 && b1 == MASK_10111011 && b2 == MASK_10111111) {
                return new Object[] { Encoding.UTF_8, null };
            }
            // default to UTF-8 if we don't have enough bytes to make a good determination of the encoding
            if (count < DETECTION_BYTES) {
                return new Object[] { Encoding.UTF_8, null };
            }
            // other encodings
            final int b3 = b4[3] & MASK_11111111;
            if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == ASCII.LESS_THAN) {
                // UCS-4, big endian (1234)
                return new Object[] { Encoding.ISO_10646_UCS_4, Boolean.TRUE };
            }
            if (b0 == ASCII.LESS_THAN && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
                // UCS-4, little endian (4321)
                return new Object[] { Encoding.ISO_10646_UCS_4, Boolean.FALSE };
            }
            if (b0 == 0x00 && b1 == 0x00 && b2 == ASCII.LESS_THAN && b3 == 0x00) {
                // UCS-4, unusual octet order (2143)
                // REVISIT: What should this be?
                return new Object[] { Encoding.ISO_10646_UCS_4, null };
            }
            if (b0 == 0x00 && b1 == ASCII.LESS_THAN && b2 == 0x00 && b3 == 0x00) {
                // UCS-4, unusual octet order (3412)
                // REVISIT: What should this be?
                return new Object[] { Encoding.ISO_10646_UCS_4, null };
            }
            if (b0 == 0x00 && b1 == ASCII.LESS_THAN && b2 == 0x00 && b3 == ASCII.QUESTIONMARK) {
                // UTF-16, big-endian, no BOM (or could turn out to be UCS-2)
                // REVISIT: What should this be?
                return new Object[] { Encoding.UTF_16_BIG_ENDIAN, Boolean.TRUE };
            }
            if (b0 == ASCII.LESS_THAN && b1 == 0x00 && b2 == ASCII.QUESTIONMARK && b3 == 0x00) {
                // UTF-16, little-endian, no BOM (or could turn out to be UCS-2)
                return new Object[] { Encoding.UTF_16_LITTLE_ENDIAN, Boolean.FALSE };
            }
            if (b0 == EBCDIC.LESS_THAN && b1 == EBCDIC.QUESTIONMARK && b2 == EBCDIC.LOWER_X && b3 == EBCDIC.LOWER_M) {
                // EBCDIC
                // a la xerces, return CP037 instead of EBCDIC here
                return new Object[] { Encoding.CP037, null };
            }
            // default encoding
            return new Object[] { Encoding.UTF_8, null };
        }
    }
}