Microsoft PowerPoint - seminar-11-Ch05-XML.ppt

00801075 李宣成

新奥尔良人, 纽约科技大学副教授, 主要讲授 Java、XML 以及面向对象编程, Internet 方面国际知名的作家、程序员和教育家, Java 创始人之一。主页 : http://www.elharo.com/ 。他的网站 Café au Lait (http://www.cafeaulait.org) 是互联网上最流行的独立 Java 网站之一; 他的另一个网站 Café con Leche (http://www.cafeconleche.org) 则是最流行的 XML 站点之一。他编写了二十余本书籍, 包括 Java I/O、Java Network Programming 和 XML in a Nutshell 等等。

铺垫 XML: 可扩展标记语言, 为通用标记语言的一种, 是一种简单易用的数据存储语言。与 HTML 相比, 更为严谨、简洁, 且效能更佳。例 : <?xml version="1.0" encoding="iso-8859-1"?> <book catalog="programming"> <title>beautiful Code</title> <author>leading Programmers</author> <year>2010</year> <price>99.0</price> </book>

铺垫 JDOM、XOM: 两者都是 XML 的 API, 使得程序员可以通过 Java 来访问、操作和输出 XML 数据。DTD: 文档类型定义。一个 XML 文档可以引用外部的 DTD, 也可以自身在开头使用 <!DOCTYPE>、<!ELEMENT>、<!ATTLIST> 等声明来定义 DTD。它声明了 XML 文档的元素, 并描述了文档的格式。一个 XML 文档可以携带一个 DTD 来测试它是否为有效的 XML 文档。例 : <?xml version="1.0"?> <!DOCTYPE note [ <!ELEMENT note (to,from,heading,body)> <!ELEMENT to (#PCDATA)> <!ELEMENT from (#PCDATA)> <!ELEMENT heading (#PCDATA)> <!ELEMENT body (#PCDATA)> ]> 对应于这个 DTD 的定义, 合法的 XML 文档有如下的格式 : <note> <to>tove</to> <from>jani</from> <heading>reminder</heading> <body>don't forget me this weekend</body> </note>

铺垫 Namespace: 即文中所说的命名空间。它的使用是为了区分不同 XML 应用中同名的元素和属性, 或者在应用中归类相关的元素和属性, 让程序能容易地识别它们。命名空间在操作上是将每个元素和属性加上一个前置字, 每个前置字都对应到一个 URI。附属于同一个 URI 的元素和属性, 会属于同一个命名空间, 标准 URI 可用来识别不同 XML 应用的元素。命名空间中的元素和属性之名称会包含一个冒号, 在它之前的所有东西, 称作前置字; 而之后的所有东西, 称作局部名字。例 : <?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?> <catalog> <book> <title> Beautiful Code </title> </book> <paint xmlns:pt="http://www.example.com/paint"> <pt:title> 111</pt:title> </paint> </catalog>

确保 XML 文档正确性所需要的两个冗余的数据检查 : 1. 输入验证, 验证 XML 文档的结构是否符合规范以及是否与其所携带的 DTD 相匹配; 2. 输出验证, 使用 DOM、JDOM 等工具构造 XML 文档时检查传递给 API 的字符串在 XML 中的合法性。XML 名字的合法性约束 : 文档中不允许出现交迭的元素 ( 由于 JDOM 的机制不需考虑 ); 元素名、属性名或者处理指令的名字必须是合法的 XML 名字; 局部名字不能包含冒号; 属性的命名空间不能与其父元素或者兄弟属性的命名空间冲突; 每一个 Unicode 的代理字符出现在一个代理对中; 处理指令数据不包含双字符字符串 ?>。

例 5-1. 分析 XML 名字的 BNF 语法
BNF 语法 : ' ' 中的字代表字符本身; < > 中的为必选项; [ ] 中的为可选项; { } 中的为可重复 0 至无数次的项; | 表示左右两边任选一项; ::= 是被定义为的意思
BaseChar ::= [#x0041-#x005a] | [#x0061-#x007a] | [#x00c0-#x00d6] | [#x00d8-#x00f6] | [#x00f8-#x00ff] | [#x0100-#x0131] | [#x0134-#x013e] ...
NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
Name ::= (Letter | '_' | ':') (NameChar)*
Letter ::= BaseChar | Ideographic
Ideographic ::= [#x4e00-#x9fa5] | #x3007 | [#x3021-#x3029]
Digit ::= [#x0030-#x0039] | [#x0660-#x0669] | [#x06f0-#x06f9] | [#x0966-#x096f] | [#x09e6-#x09ef] | [#x0a66-#x0a6f] | [#x0ae6-#x0aef] | [#x0b66-#x0b6f] | [#x0be7-#x0bef] | [#x0c66-#x0c6f] | [#x0ce6-#x0cef] | [#x0d66-#x0d6f] | [#x0e50-#x0e59] | [#x0ed0-#x0ed9] | [#x0f20-#x0f29]
Extender ::= #x00b7 | #x02d0 | #x02d1 | #x0387 | #x0640 | #x0e46 | #x0ec6 | #x3005 | [#x3031-#x3035] | [#x309d-#x309e] | [#x30fc-#x30fe]
CombiningChar ::= [#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05a1] | [#x05a3-#x05b9] | [#x05bb-#x05bd] | #x05bf | [#x05c1-#x05c2] | #x05c4 | [#x064b-#x0652] | #x0670 | [#x06d6-#x06dc] ...

private static String checkxmlname(string name) { // Cannot be empty or null if ((name == null) (name.length() == 0) (name.trim( ).equals(""))) { return "XML names cannot be null or empty"; 第// Cannot start with a number 一char first = name.charat(0); if (Character.isDigit(first)) { 个return "XML names cannot begin with a number."; 版 // Cannot start with a $ 本if (first == '$') { 的return "XML names cannot begin with a dollar sign ($)."; 名// Cannot start with a _ 字if (first == '-') { return "XML names cannot begin with a hyphen (-)."; 字符// Ensure valid content for (int i=0, len = name.length( ); i<len; i++) { 验char c = name.charat(i); 证if ((!Character.isLetterOrDigit(c)) && (c!= '-') && (c!= '$') && (c!= '_')) { return c + " is not allowed in XML names."; // We got here, so everything is OK return null;

版本一的缺陷 : 它使用 Java 自带的判定函数来判断字符是否合法, 由此导致两个问题 : 1. Java 中这些函数的定义并没有完全遵照 XML 中关于字母和数字的定义, 即 Java 把一些 XML 中非法的字母视为合法, 或者正好相反; 2. Java 中的验证规则在版本更新时会有所变化, 而 XML 的验证规则没有变化。另外, 这个版本允许在名字中包含冒号, 这不利于维持命名空间的良好形式。P.S. XML 中不合法的字符可以将其转化为实体使用, 如 : &lt; (<)、&gt; (>)、&amp; (&)、&apos; (')、&quot; (")。* XML 标准 1.1 中建议文档作者使用在自然语言中有意义的字词作为 XML 名称, 并避免在名称中使用符号字符或空白字符。注意 : 冒号 (:)、连字符 (-)、句号 (.)、下划线 (_) 和圆点 (·) 是明确允许的。详情参见 : http://www.w3china.org/translation/xml1p1cr20021015_cn.htm

private static String checkxmlname(string name) { // Cannot be empty or null if ((name == null) (name.length( ) == 0) (name.trim( ).equals(""))) { return "XML names cannot be null or empty"; // Cannot start with a number char first = name.charat(0); if (!isxmlnamestartcharacter(first)) { return "XML names cannot begin with the character \"" + first + "\""; // Ensure valid content for (int i=0, len = name.length( ); i<len; i++) { char c = name.charat(i); if (!isxmlnamecharacter(c)) { return "XML names cannot contain the character \"" + c + "\""; // We got here, so everything is OK return null;

以上程序中调用的几个函数 : public static boolean isxmlnamecharacter(char c) { return (isxmlletter(c) isxmldigit(c) c == '.' c == '-' c == '_' c == ':' isxmlcombiningchar(c) isxmlextender(c)); public static boolean isxmlnamestartcharacter(char c) { return (isxmlletter(c) c == '_' c ==':'); public static boolean isxmldigit(char c) { if (c >= 0x0030 && c <= 0x0039) return true; if (c >= 0x0660 && c <= 0x0669) return true; if (c >= 0x06F0 && c <= 0x06F9) return true; if (c >= 0x0966 && c <= 0x096F) return true; if (c >= 0x09E6 && c <= 0x09EF) return true; if (c >= 0x0A66 && c <= 0x0A6F) return true; if (c >= 0x0AE6 && c <= 0x0AEF) return true; if (c >= 0x0B66 && c <= 0x0B6F) return true; if (c >= 0x0BE7 && c <= 0x0BEF) return true; if (c >= 0x0C66 && c <= 0x0C6F) return true; if (c >= 0x0CE6 && c <= 0x0CEF) return true; if (c >= 0x0D66 && c <= 0x0D6F) return true; if (c >= 0x0E50 && c <= 0x0E59) return true; if (c >= 0x0ED0 && c <= 0x0ED9) return true; if (c >= 0x0F20 && c <= 0x0F29) return true; return false;

public static boolean isxmldigit(char c) { if (c < 0x0030) return false; if (c <= 0x0039) return true; if (c < 0x0660) return false; if (c <= 0x0669) return true; if (c < 0x06F0) return false; if (c <= 0x06F9) return true; if (c < 0x0966) return false; if (c <= 0x096F) return true; if (c < 0x09E6) return false; if (c <= 0x09EF) return true; if (c < 0x0A66) return false; if (c <= 0x0A6F) return true; if (c < 0x0AE6) return false; if (c <= 0x0AEF) return true; if (c < 0x0B66) return false; if (c <= 0x0B6F) return true; if (c < 0x0BE7) return false; if (c <= 0x0BEF) return true; if (c < 0x0C66) return false; if (c <= 0x0C6F) return true; if (c < 0x0CE6) return false; if (c <= 0x0CEF) return true; if (c < 0x0D66) return false; if (c <= 0x0D6F) return true; if (c < 0x0E50) return false; if (c <= 0x0E59) return true; if (c < 0x0ED0) return false; if (c <= 0x0ED9) return true; if (c < 0x0F20) return false; if (c <= 0x0F29) return true; 同时识别合法和非法字符 return false;

private Element( ) { static Element build(string name, String uri, String localname) { Element result = new Element( ); String prefix = ""; int colon = name.indexof(':'); if (colon >= 0) { prefix = name.substring(0, colon); result.prefix = prefix; result.localname = localname; // We do need to verify the URI here because parsers are // allowing relative URIs which XOM forbids, for reasons // of canonical XML if nothing else. But we only have to verify // that it's an absolute base URI. I don't have to verify // no conflicts. if (! "".equals(uri)) Verifier.checkAbsoluteURIReference(uri); result.uri = uri; return result;

验证 16 进制字符的 switch 语句 : switch(c) { case '0': return true; case '1': return true; case '2': return true; case 'd': return true; case 'e': return true; case 'f': return true; return false; >64K 大型 switch 语句直接查表, 按位储存信息

保存并复制二进制查找表 : <target name="compile-core" depends="prepare, compile-jaxen" description="compile the source code"> <javac srcdir="${build.src" destdir="${build.dest"> <classpath refid="compile.class.path"/> </javac> <copy file="${build.src/nu/xom/character s.dat" tofile="${build.dest/nu/xom/charac ters.dat"/> </target> 装载二进制查找表 : private static byte[] flags = null; static { ClassLoader loader = Verifier.class.getClassLoader( ); if (loader!= null) loadflags(loader); // If that didn't work, try a different ClassLoader if (flags == null) { loader = Thread.currentThread().getContextClassLoader( ); loadflags(loader); private static void loadflags(classloader loader) { DataInputStream in = null; try { InputStream raw = loader.getresourceasstream("nu/xom/characters.d if (raw == null) { throw new RuntimeException("Broken XOM installation: " + "could not load nu/xom/characters.dat"); in = new DataInputStream(raw); flags = new byte[65536]; in.readfully(flags); catch (IOException ex) { throw new RuntimeException("Broken XOM installation: " + "could not load nu/xom/characters.dat"); finally { try { if (in!= null) in.close( ); catch (IOException ex) { // no big deal

使用查表法验证名字 : private static void loadflags(classloader loader) { DataInputStream in = null; try { InputStream raw = loader.getresourceasstream("nu/xom/characters.dat"); if (raw == null) { throw new RuntimeException("Broken XOM installation: " + "could not load nu/xom/characters.dat"); in = new DataInputStream(raw); flags = new byte[65536]; in.readfully(flags); catch (IOException ex) { throw new RuntimeException("Broken XOM installation: " + "could not load nu/xom/characters.dat"); finally { try { if (in!= null) in.close( ); catch (IOException ex) { // no big deal

在缓存中记录命名空间的 URI private final static class URICache { private final static int LOAD = 6; private String[] cache = new String[LOAD]; private int position = 0; synchronized boolean contains(string s) { for (int i = 0; i < LOAD; i++) { // Here I'm assuming the namespace URIs are interned. // This is commonly but not always true. This won't // break if they haven't been. Using equals( ) instead // of == is faster when the namespace URIs haven't been // interned but slower if they have. if (s == cache[i]) { return true; return false; synchronized void put(string s) { cache[position] = s; position++; if (position == LOAD) position = 0;