Index: org/jdom/Verifier.java =================================================================== RCS file: /home/cvspublic/jdom/src/java/org/jdom/Verifier.java,v retrieving revision 1.56 diff -u -r1.56 Verifier.java --- org/jdom/Verifier.java 22 Nov 2007 07:00:38 -0000 1.56 +++ org/jdom/Verifier.java 5 Dec 2007 14:09:01 -0000 @@ -161,17 +161,17 @@ int ch = text.charAt(i); // Check if high part of a surrogate pair - if (ch >= 0xD800 && ch <= 0xDBFF) { + if (isHighSurrogate((char) ch)) { // Check if next char is the low-surrogate i++; if (i < len) { char low = text.charAt(i); - if (low < 0xDC00 || low > 0xDFFF) { + if (!isLowSurrogate(low)) { return "Illegal Surrogate Pair"; } // It's a good pair, calculate the true value of // the character to then fall thru to isXMLCharacter - ch = 0x10000 + (ch - 0xD800) * 0x400 + (low - 0xDC00); + ch = decodeSurrogatePair((char) ch, low); } else { return "Surrogate Pair Truncated"; @@ -515,6 +515,16 @@ // If we got here, everything is OK return null; } + /** + * This is a utility function to decode a non-BMP + * UTF-16 surrogate pair. The arguments are not validated. + * @param high high surrogate of the pair (0xD800-0xDBFF) + * @param low low surrogate of the pair (0xDC00-0xDFFF) + * @return decoded code point (0x10000 or above for a valid pair) + */ + public static int decodeSurrogatePair(char high, char low) { + return 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00); + } // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | // [-'()+,./:=?;*#@$_%] @@ -692,6 +702,28 @@ return false; } + + /** + * This is a function for determining whether the + * specified character is the high 16 bits in a + * UTF-16 surrogate pair. + * @param ch character to check + * @return true if the character is a high surrogate, false otherwise + */ + public static boolean isHighSurrogate(char ch) { + return (ch >= 0xD800 && ch <= 0xDBFF); + } + + /** + * This is a function for determining whether the + * specified character is the low 16 bits in a + * UTF-16 surrogate pair. 
+ * @param ch character to check + * @return true if the character is a low surrogate, false otherwise. + */ + public static boolean isLowSurrogate(char ch) { + return (ch >= 0xDC00 && ch <= 0xDFFF); + } /** *

Index: org/jdom/output/XMLOutputter.java =================================================================== RCS file: /home/cvspublic/jdom/src/java/org/jdom/output/XMLOutputter.java,v retrieving revision 1.116 diff -u -r1.116 XMLOutputter.java --- org/jdom/output/XMLOutputter.java 10 Nov 2007 05:29:01 -0000 1.116 +++ org/jdom/output/XMLOutputter.java 5 Dec 2007 14:09:01 -0000 @@ -1337,16 +1337,18 @@ * * @param str String input to escape. * @return String with escaped content. + * @throws IOException if a character to be escaped starts a malformed or truncated surrogate pair */ - public String escapeAttributeEntities(String str) { + public String escapeAttributeEntities(String str) throws IOException { StringBuffer buffer; - char ch; + int ch, pos; String entity; EscapeStrategy strategy = currentFormat.escapeStrategy; buffer = null; for (int i = 0; i < str.length(); i++) { ch = str.charAt(i); + pos = i; switch(ch) { case '<' : entity = "&lt;"; @@ -1375,7 +1377,25 @@ entity = "&#xA;"; break; default : - if (strategy.shouldEscape(ch)) { + + if (strategy.shouldEscape((char) ch)) { + //if the character to escape is the high half of a UTF-16 + //surrogate pair, decode the pair and escape it as one code point. 
+ if(Verifier.isHighSurrogate((char) ch)) { + //this is the high half of a surrogate pair + i++; + if (i < str.length()) { + char low = str.charAt(i); + if(!Verifier.isLowSurrogate(low)) { + throw new IOException("Could not decode surrogate pair 0x" + + Integer.toHexString(ch) + " / 0x" + Integer.toHexString(low)); + } + ch = Verifier.decodeSurrogatePair((char) ch, low); + } else { + throw new IOException("Surrogate pair 0x" + + Integer.toHexString(ch) + " truncated"); + } + } entity = "&#x" + Integer.toHexString(ch) + ";"; } else { @@ -1390,13 +1410,13 @@ buffer = new StringBuffer(str.length() + 20); // Copy previous skipped characters and fall through // to pickup current character - buffer.append(str.substring(0, i)); + buffer.append(str.substring(0, pos)); buffer.append(entity); } } else { if (entity == null) { - buffer.append(ch); + buffer.append((char) ch); } else { buffer.append(entity); @@ -1419,18 +1439,20 @@ * * @param str String input to escape. * @return String with escaped content. + * @throws IOException if a character to be escaped starts a malformed or truncated surrogate pair */ - public String escapeElementEntities(String str) { + public String escapeElementEntities(String str) throws IOException { if (escapeOutput == false) return str; StringBuffer buffer; - char ch; + int ch, pos; String entity; EscapeStrategy strategy = currentFormat.escapeStrategy; buffer = null; for (int i = 0; i < str.length(); i++) { ch = str.charAt(i); + pos = i; switch(ch) { case '<' : entity = "&lt;"; @@ -1448,7 +1470,26 @@ entity = currentFormat.lineSeparator; break; default : - if (strategy.shouldEscape(ch)) { + + if (strategy.shouldEscape((char) ch)) { + + //if the character to escape is the high half of a UTF-16 + //surrogate pair, decode the pair and escape it as one code point. 
+ if(Verifier.isHighSurrogate((char) ch)) { + //this is the high half of a surrogate pair + i++; + if (i < str.length()) { + char low = str.charAt(i); + if(!Verifier.isLowSurrogate(low)) { + throw new IOException("Could not decode surrogate pair 0x" + + Integer.toHexString(ch) + " / 0x" + Integer.toHexString(low)); + } + ch = Verifier.decodeSurrogatePair((char) ch, low); + } else { + throw new IOException("Surrogate pair 0x" + + Integer.toHexString(ch) + " truncated"); + } + } entity = "&#x" + Integer.toHexString(ch) + ";"; } else { @@ -1463,13 +1504,13 @@ buffer = new StringBuffer(str.length() + 20); // Copy previous skipped characters and fall through // to pickup current character - buffer.append(str.substring(0, i)); + buffer.append(str.substring(0, pos)); buffer.append(entity); } } else { if (entity == null) { - buffer.append(ch); + buffer.append((char) ch); } else { buffer.append(entity);