Index: org/jdom/Verifier.java
===================================================================
RCS file: /home/cvspublic/jdom/src/java/org/jdom/Verifier.java,v
retrieving revision 1.56
diff -u -r1.56 Verifier.java
--- org/jdom/Verifier.java 22 Nov 2007 07:00:38 -0000 1.56
+++ org/jdom/Verifier.java 5 Dec 2007 14:09:01 -0000
@@ -161,17 +161,17 @@
int ch = text.charAt(i);
// Check if high part of a surrogate pair
- if (ch >= 0xD800 && ch <= 0xDBFF) {
+ if (isHighSurrogate((char) ch)) {
// Check if next char is the low-surrogate
i++;
if (i < len) {
char low = text.charAt(i);
- if (low < 0xDC00 || low > 0xDFFF) {
+ if (!isLowSurrogate(low)) {
return "Illegal Surrogate Pair";
}
// It's a good pair, calculate the true value of
// the character to then fall thru to isXMLCharacter
- ch = 0x10000 + (ch - 0xD800) * 0x400 + (low - 0xDC00);
+ ch = decodeSurrogatePair((char) ch, low);
}
else {
return "Surrogate Pair Truncated";
@@ -515,6 +515,16 @@
// If we got here, everything is OK
return null;
}
+ /**
+ * This is a utility function to decode a non-BMP
+ * UTF-16 surrogate pair.
+ * @param high high 16 bits
+ * @param low low 16 bits
+ * @return decoded character
+ */
+ public static int decodeSurrogatePair(char high, char low) {
+ return 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00);
+ }
// [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] |
// [-'()+,./:=?;*#@$_%]
@@ -692,6 +702,28 @@
return false;
}
+
+ /**
+ * This is a function for determining whether the
+ * specified character is the high 16 bits in a
+ * UTF-16 surrogate pair.
+ * @param ch character to check
+ * @return true if the character is a high surrogate, false otherwise
+ */
+ public static boolean isHighSurrogate(char ch) {
+ return (ch >= 0xD800 && ch <= 0xDBFF);
+ }
+
+ /**
+ * This is a function for determining whether the
+ * specified character is the low 16 bits in a
+ * UTF-16 surrogate pair.
+ * @param ch character to check
+ * @return true if the character is a low surrogate, false otherwise.
+ */
+ public static boolean isLowSurrogate(char ch) {
+ return (ch >= 0xDC00 && ch <= 0xDFFF);
+ }
/**
*
Index: org/jdom/output/XMLOutputter.java
===================================================================
RCS file: /home/cvspublic/jdom/src/java/org/jdom/output/XMLOutputter.java,v
retrieving revision 1.116
diff -u -r1.116 XMLOutputter.java
--- org/jdom/output/XMLOutputter.java 10 Nov 2007 05:29:01 -0000 1.116
+++ org/jdom/output/XMLOutputter.java 5 Dec 2007 14:09:01 -0000
@@ -1337,16 +1337,18 @@
*
* @param str <code>String</code> input to escape.
* @return <code>String</code> with escaped content.
+ * @throws IOException if an entity can not be escaped
*/
- public String escapeAttributeEntities(String str) {
+ public String escapeAttributeEntities(String str) throws IOException {
StringBuffer buffer;
- char ch;
+ int ch, pos;
String entity;
EscapeStrategy strategy = currentFormat.escapeStrategy;
buffer = null;
for (int i = 0; i < str.length(); i++) {
ch = str.charAt(i);
+ pos = i;
switch(ch) {
case '<' :
entity = "&lt;";
@@ -1375,7 +1377,25 @@
entity = "&#xA;";
break;
default :
- if (strategy.shouldEscape(ch)) {
+
+ if (strategy.shouldEscape((char) ch)) {
+ //make sure what we are escaping is not the
+ //beginning of a surrogate pair (a two-char UTF-16 sequence).
+ if(Verifier.isHighSurrogate((char) ch)) {
+ //this is the high half of a surrogate pair
+ i++;
+ if (i < str.length()) {
+ char low = str.charAt(i);
+ if(!Verifier.isLowSurrogate(low)) {
+ throw new IOException("Could not decode surrogate pair 0x" +
+ Integer.toHexString(ch) + " / 0x" + Integer.toHexString(low));
+ }
+ ch = Verifier.decodeSurrogatePair((char) ch, low);
+ } else {
+ throw new IOException("Surrogate pair 0x" +
+ Integer.toHexString(ch) + " truncated");
+ }
+ }
entity = "&#x" + Integer.toHexString(ch) + ";";
}
else {
@@ -1390,13 +1410,13 @@
buffer = new StringBuffer(str.length() + 20);
// Copy previous skipped characters and fall through
// to pickup current character
- buffer.append(str.substring(0, i));
+ buffer.append(str.substring(0, pos));
buffer.append(entity);
}
}
else {
if (entity == null) {
- buffer.append(ch);
+ buffer.append((char) ch);
}
else {
buffer.append(entity);
@@ -1419,18 +1439,20 @@
*
* @param str <code>String</code> input to escape.
* @return <code>String</code> with escaped content.
+ * @throws IOException if an entity can not be escaped
*/
- public String escapeElementEntities(String str) {
+ public String escapeElementEntities(String str) throws IOException {
if (escapeOutput == false) return str;
StringBuffer buffer;
- char ch;
+ int ch, pos;
String entity;
EscapeStrategy strategy = currentFormat.escapeStrategy;
buffer = null;
for (int i = 0; i < str.length(); i++) {
ch = str.charAt(i);
+ pos = i;
switch(ch) {
case '<' :
entity = "&lt;";
@@ -1448,7 +1470,26 @@
entity = currentFormat.lineSeparator;
break;
default :
- if (strategy.shouldEscape(ch)) {
+
+ if (strategy.shouldEscape((char) ch)) {
+
+ //make sure what we are escaping is not the
+ //beginning of a surrogate pair (a two-char UTF-16 sequence).
+ if(Verifier.isHighSurrogate((char) ch)) {
+ //this is the high half of a surrogate pair
+ i++;
+ if (i < str.length()) {
+ char low = str.charAt(i);
+ if(!Verifier.isLowSurrogate(low)) {
+ throw new IOException("Could not decode surrogate pair 0x" +
+ Integer.toHexString(ch) + " / 0x" + Integer.toHexString(low));
+ }
+ ch = Verifier.decodeSurrogatePair((char) ch, low);
+ } else {
+ throw new IOException("Surrogate pair 0x" +
+ Integer.toHexString(ch) + " truncated");
+ }
+ }
entity = "&#x" + Integer.toHexString(ch) + ";";
}
else {
@@ -1463,13 +1504,13 @@
buffer = new StringBuffer(str.length() + 20);
// Copy previous skipped characters and fall through
// to pickup current character
- buffer.append(str.substring(0, i));
+ buffer.append(str.substring(0, pos));
buffer.append(entity);
}
}
else {
if (entity == null) {
- buffer.append(ch);
+ buffer.append((char) ch);
}
else {
buffer.append(entity);