1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.bea.domingo.util;
23
24 import java.io.UnsupportedEncodingException;
25 import java.util.Properties;
26
/**
 * Converts the raw bytes of an XML document into a String, honouring the
 * character encoding announced by the document itself.
 *
 * <p>The encoding is determined in two steps: first a rough guess is made
 * from the leading bytes (byte order mark or the byte pattern of
 * <code>&lt;?xml</code>), then that guess is used to read the XML declaration
 * and pick up its <code>encoding</code> pseudo attribute.</p>
 *
 * @author <a href="mailto:kriede@users.sourceforge.net">Kurt Riede</a>
 */
public final class XMLUtil {

    /** A typical XML declaration; its length sizes the sample that is pre-parsed. */
    private static final String XML_HEADER_SAMPLE = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" ";

    /** Start of an XML declaration. */
    private static final String XML_START = "<?xml";

    /** End of an XML declaration. */
    private static final String XML_END = "?>";

    /** Number of leading bytes inspected for encoding auto-detection. */
    private static final int PROBE_LENGTH = XML_START.length() - 1;

    private XMLUtil() {
    }

    /**
     * Parses a byte array that represents an XML instance document to a String
     * with respect to encoding.
     *
     * <p>The encodings are tried in this order: the encoding declared in the
     * XML declaration, the encoding auto-detected from the leading bytes,
     * UTF-8, and finally the platform default. Inputs that are shorter than
     * an XML declaration, have no declaration, or have a malformed one are
     * decoded with the fallbacks instead of raising a runtime exception.</p>
     *
     * @param bytes the bytes to parse, must not be <code>null</code>
     * @return parsed string; empty if <code>bytes</code> is empty
     * @throws NullPointerException if <code>bytes</code> is <code>null</code>
     */
    public static String parse(final byte[] bytes) {
        if (bytes == null) {
            throw new NullPointerException("bytes must not be null");
        }
        // Copy at most PROBE_LENGTH bytes; shorter inputs leave the rest zeroed
        // and are guarded by the count passed to the detector.
        final byte[] b4 = new byte[PROBE_LENGTH];
        final int count = Math.min(bytes.length, PROBE_LENGTH);
        System.arraycopy(bytes, 0, b4, 0, count);
        final String detectedEncoding = (String) IANAEncoding.getEncodingName(b4, count)[0];

        final String preparsedResult = decodeHeaderSample(bytes, detectedEncoding);
        try {
            final String xmlEncoding = new XMLHeaderParser(preparsedResult).getEncoding();
            return new String(bytes, xmlEncoding);
        } catch (UnsupportedEncodingException ignored) {
            // No usable declaration, or the declared encoding is unknown to
            // this JVM: continue with the auto-detected encoding.
        }
        try {
            return new String(bytes, detectedEncoding);
        } catch (UnsupportedEncodingException ignored) {
            return decodeWithDefaults(bytes);
        }
    }

    /**
     * Decodes the beginning of the document so that the XML declaration can be
     * inspected. Only twice the length of a typical declaration is decoded
     * (two bytes per character cover UTF-16).
     *
     * @param bytes the complete document
     * @param encoding the auto-detected encoding name
     * @return the decoded sample, or the whole document decoded with the
     *         default encodings if <code>encoding</code> is not supported
     */
    private static String decodeHeaderSample(final byte[] bytes, final String encoding) {
        final int sampleLength = Math.min(bytes.length, 2 * XML_HEADER_SAMPLE.length());
        try {
            return new String(bytes, 0, sampleLength, encoding);
        } catch (UnsupportedEncodingException ignored) {
            return decodeWithDefaults(bytes);
        }
    }

    /**
     * Decodes the bytes as UTF-8, or with the platform default encoding if
     * UTF-8 is not available.
     *
     * @param bytes the bytes to decode
     * @return the decoded string
     */
    private static String decodeWithDefaults(final byte[] bytes) {
        try {
            return new String(bytes, Encoding.UTF_8);
        } catch (UnsupportedEncodingException ignored) {
            return new String(bytes);
        }
    }

    /**
     * Minimal parser for the XML declaration
     * (<code>&lt;?xml version="1.0" encoding="..." standalone="..."?&gt;</code>).
     * Every syntax problem is reported as an
     * {@link UnsupportedEncodingException} so that the caller can treat
     * "no encoding found" and "encoding not supported" alike.
     */
    private static final class XMLHeaderParser {

        /** Byte order mark as it appears when decoded with swapped byte order. */
        private static final int BOM_BIG_ENDIAN = 0xfffe;

        /** Byte order mark as it appears when decoded with the correct byte order. */
        private static final int BOM_LITTLE_ENDIAN = 0xfeff;

        /** An XML declaration has at most three pseudo attributes. */
        private static final int MAX_PSEUDO_ATTRIBUTES = 3;

        /** Pseudo attributes found in the declaration, keyed by name. */
        private final Properties fPseudoAttributes = new Properties();

        /** Position in preparsed string. */
        private int fPosition = 0;

        /** Decoded beginning of the document. */
        private final String fPreparsedString;

        /**
         * Constructor.
         *
         * @param preparsedString decoded beginning of the document
         */
        public XMLHeaderParser(final String preparsedString) {
            fPreparsedString = preparsedString;
        }

        /**
         * Parses the XML declaration and returns the declared encoding.
         *
         * @return the value of the <code>encoding</code> pseudo attribute, never <code>null</code>
         * @throws UnsupportedEncodingException if the XML header is missing,
         *         invalid, or does not declare an encoding
         */
        private String getEncoding() throws UnsupportedEncodingException {
            skipBOM();
            skip(XML_START);
            for (int i = 0; i < MAX_PSEUDO_ATTRIBUTES; i++) {
                parsePseudoAttribute();
            }
            skipWhiteSpace();
            skip(XML_END);
            final String encoding = fPseudoAttributes.getProperty("encoding");
            if (encoding == null) {
                throw new UnsupportedEncodingException("no encoding declared in XML header");
            }
            return encoding;
        }

        /**
         * Skips the optional byte order mark (BOM).
         */
        private void skipBOM() {
            if (fPreparsedString.length() == 0) {
                return;
            }
            final char c = fPreparsedString.charAt(0);
            if (c == BOM_LITTLE_ENDIAN || c == BOM_BIG_ENDIAN) {
                fPosition++;
            }
        }

        /**
         * Parses one pseudo attribute (<code>name = "value"</code> or
         * <code>name = 'value'</code>) and stores it. Does nothing if the end
         * of the declaration (<code>?</code>) is reached instead.
         *
         * @throws UnsupportedEncodingException if the XML header is invalid
         */
        private void parsePseudoAttribute() throws UnsupportedEncodingException {
            skipWhiteSpace();
            if (peek() == '?') {
                return;
            }
            final int nameStart = fPosition;
            while (fPosition < fPreparsedString.length()) {
                final char c = fPreparsedString.charAt(fPosition);
                if (c == '=' || isWhitespace(c)) {
                    break;
                }
                fPosition++;
            }
            final String name = fPreparsedString.substring(nameStart, fPosition);
            if (name.length() == 0) {
                throw new UnsupportedEncodingException("expected attribute name at character " + fPosition);
            }
            skipWhiteSpace();
            skip("=");
            skipWhiteSpace();
            // The value must be closed by the same kind of quote that opened it.
            final char quote = skipQuote();
            final int valueEnd = fPreparsedString.indexOf(quote, fPosition);
            if (valueEnd < 0) {
                throw new UnsupportedEncodingException("unterminated attribute value at character " + fPosition);
            }
            final String value = fPreparsedString.substring(fPosition, valueEnd);
            fPosition = valueEnd + 1;
            fPseudoAttributes.setProperty(name, value);
        }

        /**
         * Returns the character at the current position without consuming it.
         *
         * @return the current character
         * @throws UnsupportedEncodingException if the end of the input is reached
         */
        private char peek() throws UnsupportedEncodingException {
            if (fPosition >= fPreparsedString.length()) {
                throw new UnsupportedEncodingException("unexpected end of XML header at character " + fPosition);
            }
            return fPreparsedString.charAt(fPosition);
        }

        /**
         * Consumes a single or double quote.
         *
         * @return the quote character that was consumed
         * @throws UnsupportedEncodingException if there is no quote at the current position
         */
        private char skipQuote() throws UnsupportedEncodingException {
            if (fPosition < fPreparsedString.length()) {
                final char c = fPreparsedString.charAt(fPosition);
                if (c == '\"' || c == '\'') {
                    fPosition++;
                    return c;
                }
            }
            throw new UnsupportedEncodingException("expected quote at character " + fPosition);
        }

        /**
         * Advances over any white-space; stops at the end of the input.
         */
        private void skipWhiteSpace() {
            while (fPosition < fPreparsedString.length()
                    && isWhitespace(fPreparsedString.charAt(fPosition))) {
                fPosition++;
            }
        }

        /**
         * Checks if a given character is an XML white-space character
         * (space, tab, carriage return or line feed).
         *
         * @param c the character to check
         * @return <code>true</code> if the character is a white-space, else <code>false</code>
         */
        private boolean isWhitespace(final char c) {
            return c == ' ' || c == '\t' || c == '\r' || c == '\n';
        }

        /**
         * Consumes a constant string.
         *
         * @param s the string expected at the current position
         * @throws UnsupportedEncodingException if the expected string wasn't found
         */
        private void skip(final String s) throws UnsupportedEncodingException {
            if (!fPreparsedString.startsWith(s, fPosition)) {
                throw new UnsupportedEncodingException("expected " + s + " at character " + fPosition);
            }
            fPosition += s.length();
        }
    }

    /**
     * Encoding names returned by the auto-detection.
     */
    private static final class Encoding {

        /** Encoding name: UTF-8. */
        private static final String UTF_8 = "UTF-8";

        /** Encoding name: UTF-16 with big endian. */
        private static final String UTF_16_BIG_ENDIAN = "UTF-16BE";

        /** Encoding name: UTF-16 with little endian. */
        private static final String UTF_16_LITTLE_ENDIAN = "UTF-16LE";

        /**
         * Encoding name: ISO-10646-UCS-4.
         * NOTE(review): a JVM may not know this name; {@link XMLUtil#parse(byte[])}
         * then falls back to UTF-8.
         */
        private static final String ISO_10646_UCS_4 = "ISO-10646-UCS-4";

        /** Encoding name: CP037 (EBCDIC). */
        private static final String CP037 = "CP037";

        private Encoding() {
        }
    }

    /**
     * EBCDIC codes of the characters that start an XML declaration.
     */
    private static final class EBCDIC {

        /** EBCDIC character less-than ("<"). */
        private static final int LESS_THAN = 0x4C;

        /** EBCDIC character questionmark ("?"). */
        private static final int QUESTIONMARK = 0x6F;

        /** EBCDIC lower case character x. */
        private static final int LOWER_X = 0xA7;

        /** EBCDIC lower case character m. */
        private static final int LOWER_M = 0x94;

        private EBCDIC() {
        }
    }

    /**
     * ASCII codes of the characters that start an XML declaration.
     */
    private static final class ASCII {

        /** ASCII character less-than ("<"). */
        private static final int LESS_THAN = 0x3C;

        /** ASCII character questionmark ("?"). */
        private static final int QUESTIONMARK = 0x3F;

        private ASCII() {
        }
    }

    /**
     * Auto-detection of the encoding from the first bytes of a document.
     */
    private static final class IANAEncoding {

        /** Byte value <tt>10111011</tt>, second byte of the UTF-8 BOM. */
        private static final int MASK_10111011 = 0xBB;

        /** Byte value <tt>10111111</tt>, third byte of the UTF-8 BOM. */
        private static final int MASK_10111111 = 0xBF;

        /** Bit mask <tt>11111111</tt>; also a byte of the UTF-16 BOM. */
        private static final int MASK_11111111 = 0xFF;

        /** Byte value <tt>11111110</tt>, a byte of the UTF-16 BOM. */
        private static final int MASK_11111110 = 0xFE;

        /** Byte value <tt>11101111</tt>, first byte of the UTF-8 BOM. */
        private static final int MASK_11101111 = 0xEF;

        private IANAEncoding() {
        }

        /**
         * Returns the IANA encoding name that is auto-detected from the bytes
         * specified, with the endian-ness of that encoding where appropriate.
         * UTF-8 is returned whenever nothing more specific is recognised.
         *
         * @param b4 The first four bytes of the input.
         * @param count The number of bytes actually read.
         * @return a 2-element array: the first element, an IANA-encoding string,
         *         the second element a Boolean which is true iff the document is
         *         big endian, false if it's little-endian, and null if the
         *         distinction isn't relevant.
         */
        private static Object[] getEncodingName(final byte[] b4, final int count) {
            if (count < 2) {
                return new Object[] { Encoding.UTF_8, null };
            }

            // UTF-16 with a byte order mark
            final int b0 = b4[0] & MASK_11111111;
            final int b1 = b4[1] & MASK_11111111;
            if (b0 == MASK_11111110 && b1 == MASK_11111111) {
                return new Object[] { Encoding.UTF_16_BIG_ENDIAN, Boolean.TRUE };
            }
            if (b0 == MASK_11111111 && b1 == MASK_11111110) {
                return new Object[] { Encoding.UTF_16_LITTLE_ENDIAN, Boolean.FALSE };
            }

            if (count < 3) {
                return new Object[] { Encoding.UTF_8, null };
            }

            // UTF-8 with a byte order mark
            final int b2 = b4[2] & MASK_11111111;
            if (b0 == MASK_11101111 && b1 == MASK_10111011 && b2 == MASK_10111111) {
                return new Object[] { Encoding.UTF_8, null };
            }

            if (count < 2 + 2) {
                return new Object[] { Encoding.UTF_8, null };
            }

            // No BOM: look at how the leading "<" (and "?") is laid out.
            final int b3 = b4[3] & MASK_11111111;
            if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == ASCII.LESS_THAN) {
                // 00 00 00 3C: four-byte characters, big endian
                return new Object[] { Encoding.ISO_10646_UCS_4, Boolean.TRUE };
            }
            if (b0 == ASCII.LESS_THAN && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
                // 3C 00 00 00: four-byte characters, little endian
                return new Object[] { Encoding.ISO_10646_UCS_4, Boolean.FALSE };
            }
            if (b0 == 0x00 && b1 == 0x00 && b2 == ASCII.LESS_THAN && b3 == 0x00) {
                // 00 00 3C 00: four-byte characters, unusual byte order
                return new Object[] { Encoding.ISO_10646_UCS_4, null };
            }
            if (b0 == 0x00 && b1 == ASCII.LESS_THAN && b2 == 0x00 && b3 == 0x00) {
                // 00 3C 00 00: four-byte characters, unusual byte order
                return new Object[] { Encoding.ISO_10646_UCS_4, null };
            }
            if (b0 == 0x00 && b1 == ASCII.LESS_THAN && b2 == 0x00 && b3 == ASCII.QUESTIONMARK) {
                // 00 3C 00 3F: "<?" in two-byte characters, big endian
                return new Object[] { Encoding.UTF_16_BIG_ENDIAN, Boolean.TRUE };
            }
            if (b0 == ASCII.LESS_THAN && b1 == 0x00 && b2 == ASCII.QUESTIONMARK && b3 == 0x00) {
                // 3C 00 3F 00: "<?" in two-byte characters, little endian
                return new Object[] { Encoding.UTF_16_LITTLE_ENDIAN, Boolean.FALSE };
            }
            if (b0 == EBCDIC.LESS_THAN && b1 == EBCDIC.QUESTIONMARK && b2 == EBCDIC.LOWER_X && b3 == EBCDIC.LOWER_M) {
                // 4C 6F A7 94: "<?xm" in EBCDIC
                return new Object[] { Encoding.CP037, null };
            }

            return new Object[] { Encoding.UTF_8, null };
        }
    }
}