| 1 | package dolda.jsvc.next; |
| 2 | |
| 3 | import java.io.*; |
| 4 | import java.util.*; |
| 5 | import org.w3c.dom.*; |
| 6 | import org.w3c.dom.bootstrap.*; |
| 7 | |
| 8 | public class Parser { |
| 9 | private static final DOMImplementation domimp; |
| 10 | |
| 11 | static { |
| 12 | DOMImplementationRegistry reg; |
| 13 | try { |
| 14 | reg = DOMImplementationRegistry.newInstance(); |
| 15 | } catch(Exception e) { |
| 16 | throw(new Error(e)); |
| 17 | } |
| 18 | DOMImplementation di = reg.getDOMImplementation(""); |
| 19 | if(di == null) |
| 20 | throw(new RuntimeException("Could not get a DOM implemenation")); |
| 21 | domimp = di; |
| 22 | } |
| 23 | |
| 24 | private static boolean namechar(char c) { |
| 25 | return((c == ':') || (c == '_') || (c == '$') || (c == '.') || (c == '-') || ((c >= '0') && (c <= '9')) || ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z'))); |
| 26 | } |
| 27 | |
| 28 | protected String entity(String name) { |
| 29 | if(name.equals("amp")) |
| 30 | return("&"); |
| 31 | if(name.equals("lt")) |
| 32 | return("<"); |
| 33 | if(name.equals("gt")) |
| 34 | return(">"); |
| 35 | if(name.equals("apos")) |
| 36 | return("'"); |
| 37 | if(name.equals("quot")) |
| 38 | return("\""); |
| 39 | return(null); |
| 40 | } |
| 41 | |
| 42 | protected Element makenode(Document doc, String name) { |
| 43 | return(doc.createElementNS(null, name)); |
| 44 | } |
| 45 | |
| 46 | protected Attr makeattr(Document doc, Element el, String name) { |
| 47 | return(doc.createAttributeNS(el.getNamespaceURI(), name)); |
| 48 | } |
| 49 | |
| 50 | public DocumentFragment parse(Reader in) throws IOException { |
| 51 | Stack<Node> stack = new Stack<Node>(); |
| 52 | Document doc = domimp.createDocument(null, "dummy", null); |
| 53 | DocumentFragment frag = doc.createDocumentFragment(); |
| 54 | stack.push(frag); |
| 55 | String st = "content"; |
| 56 | int c = in.read(); |
| 57 | StringBuilder buf = new StringBuilder(); |
| 58 | StringBuilder ebuf = new StringBuilder(); |
| 59 | char atype = 0; |
| 60 | int cdashcnt = 0; |
| 61 | while(true) { |
| 62 | if(st == "content") { |
| 63 | if(c == '<') { |
| 64 | st = "tag"; |
| 65 | c = in.read(); |
| 66 | } else if(c < 0) { |
| 67 | if(stack.peek() == frag) |
| 68 | return(frag); |
| 69 | else |
| 70 | throw(new ParseException("Unexpected end-of-file while parsing non-root element")); |
| 71 | } else { |
| 72 | st = "text"; |
| 73 | } |
| 74 | } else if(st == "tag") { |
| 75 | if(Character.isWhitespace((char)c)) { |
| 76 | c = in.read(); |
| 77 | } else if(c == '!') { |
| 78 | cdashcnt = 0; |
| 79 | c = in.read(); |
| 80 | st = "comment"; |
| 81 | } else if(namechar((char)c)) { |
| 82 | st = "stag"; |
| 83 | } else if(c == '/') { |
| 84 | c = in.read(); |
| 85 | st = "etag"; |
| 86 | } else if(c < 0) { |
| 87 | throw(new ParseException("Unexpected end-of-file while parsing tag")); |
| 88 | } else { |
| 89 | throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in tag name")); |
| 90 | } |
| 91 | } else if(st == "stag") { |
| 92 | boolean flush = false; |
| 93 | if(namechar((char)c)) { |
| 94 | buf.append((char)c); |
| 95 | c = in.read(); |
| 96 | } else if(c == '>') { |
| 97 | flush = true; |
| 98 | } else if(Character.isWhitespace((char)c)) { |
| 99 | flush = true; |
| 100 | c = in.read(); |
| 101 | } else if(c < 0) { |
| 102 | throw(new ParseException("Unexpected end-of-file while parsing tag name")); |
| 103 | } else { |
| 104 | throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in tag name")); |
| 105 | } |
| 106 | if(flush) { |
| 107 | Element n = makenode(doc, buf.toString()); |
| 108 | buf = new StringBuilder(); |
| 109 | stack.peek().appendChild(n); |
| 110 | stack.push(n); |
| 111 | st = "attr"; |
| 112 | } |
| 113 | } else if(st == "comment") { |
| 114 | if(c == '-') { |
| 115 | cdashcnt++; |
| 116 | c = in.read(); |
| 117 | } else if((c == '>') && (cdashcnt == 4)) { |
| 118 | stack.peek().appendChild(doc.createComment(buf.toString())); |
| 119 | buf = new StringBuilder(); |
| 120 | st = "content"; |
| 121 | c = in.read(); |
| 122 | } else if(cdashcnt >= 2) { |
| 123 | if(cdashcnt > 2) |
| 124 | cdashcnt = 2; |
| 125 | buf.append((char)c); |
| 126 | c = in.read(); |
| 127 | } else if(c < 0) { |
| 128 | throw(new ParseException("Unexpected end-of-file while parsing comment")); |
| 129 | } else { |
| 130 | throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in comment")); |
| 131 | } |
| 132 | } else if(st == "attr") { |
| 133 | if(namechar((char)c)) { |
| 134 | st = "aname"; |
| 135 | } else if(c == '>') { |
| 136 | st = "content"; |
| 137 | c = in.read(); |
| 138 | } else if(c == '/') { |
| 139 | st = "stagend"; |
| 140 | c = in.read(); |
| 141 | } else if(Character.isWhitespace((char)c)) { |
| 142 | c = in.read(); |
| 143 | } else if(c < 0) { |
| 144 | throw(new ParseException("Unexpected end-of-file while parsing attributes")); |
| 145 | } else { |
| 146 | throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered inside tag")); |
| 147 | } |
| 148 | } else if(st == "stagend") { |
| 149 | if(c == '>') { |
| 150 | stack.pop(); |
| 151 | c = in.read(); |
| 152 | st = "content"; |
| 153 | } else if(Character.isWhitespace((char)c)) { |
| 154 | c = in.read(); |
| 155 | } else if(c < 0) { |
| 156 | throw(new ParseException("Unexpected end-of-file at end of empty tag")); |
| 157 | } else { |
| 158 | throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered at and of empty tag")); |
| 159 | } |
| 160 | } else if(st == "aname") { |
| 161 | if(namechar((char)c)) { |
| 162 | buf.append((char)c); |
| 163 | c = in.read(); |
| 164 | } else if(Character.isWhitespace((char)c)) { |
| 165 | c = in.read(); |
| 166 | } else if(c == '=') { |
| 167 | Element el = (Element)stack.peek(); |
| 168 | Attr attr = makeattr(doc, el, buf.toString()); |
| 169 | el.setAttributeNodeNS(attr); |
| 170 | buf = new StringBuilder(); |
| 171 | stack.push(attr); |
| 172 | st = "avalstart"; |
| 173 | c = in.read(); |
| 174 | } else if(c < 0) { |
| 175 | throw(new ParseException("Unexpected end-of-file while parsing attribute name")); |
| 176 | } else { |
| 177 | throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in attribute name")); |
| 178 | } |
| 179 | } else if(st == "avalstart") { |
| 180 | if((c == '\'') || (c == '"')) { |
| 181 | atype = (char)c; |
| 182 | c = in.read(); |
| 183 | st = "aval"; |
| 184 | } else if(Character.isWhitespace((char)c)) { |
| 185 | c = in.read(); |
| 186 | } else if(c < 0) { |
| 187 | throw(new ParseException("Unexpected end-of-file while parsing attribute value")); |
| 188 | } else { |
| 189 | throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in attribute value")); |
| 190 | } |
| 191 | } else if(st == "aval") { |
| 192 | if(c == atype) { |
| 193 | c = in.read(); |
| 194 | Attr a = (Attr)stack.pop(); |
| 195 | a.setValue(buf.toString()); |
| 196 | buf = new StringBuilder(); |
| 197 | st = "attr"; |
| 198 | } else if(c == '&') { |
| 199 | c = in.read(); |
| 200 | st = "aent"; |
| 201 | } else if(c < 0) { |
| 202 | throw(new ParseException("Unexpected end-of-file while parsing attribute value")); |
| 203 | } else { |
| 204 | buf.append((char)c); |
| 205 | c = in.read(); |
| 206 | } |
| 207 | } else if(st == "etag") { |
| 208 | if(namechar((char)c)) { |
| 209 | buf.append((char)c); |
| 210 | c = in.read(); |
| 211 | } else if(c == '>') { |
| 212 | String nm = buf.toString(); |
| 213 | buf = new StringBuilder(); |
| 214 | Node n = stack.pop(); |
| 215 | if(n instanceof DocumentFragment) |
| 216 | throw(new ParseException("Unexpected end tag for `" + nm + "' while parsing root content")); |
| 217 | Element el = (Element)n; |
| 218 | if(!nm.equals(el.getTagName())) |
| 219 | throw(new ParseException("Unexpected end tag for `" + nm + "' while parsing `" + el.getTagName() + "'")); |
| 220 | c = in.read(); |
| 221 | st = "content"; |
| 222 | } else if(c < 0) { |
| 223 | throw(new ParseException("Unexpected end-of-file while parsing end tag")); |
| 224 | } else { |
| 225 | throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in end tag")); |
| 226 | } |
| 227 | } else if(st == "text") { |
| 228 | boolean flush = false; |
| 229 | if(c == '&') { |
| 230 | st = "ent"; |
| 231 | c = in.read(); |
| 232 | } else if(c == '<') { |
| 233 | flush = true; |
| 234 | st = "content"; |
| 235 | } else if(c < 0) { |
| 236 | flush = true; |
| 237 | st = "content"; |
| 238 | } else { |
| 239 | buf.append((char)c); |
| 240 | c = in.read(); |
| 241 | } |
| 242 | if(flush) { |
| 243 | Text n = doc.createTextNode(buf.toString()); |
| 244 | buf = new StringBuilder(); |
| 245 | stack.peek().appendChild(n); |
| 246 | } |
| 247 | } else if(st == "ent") { |
| 248 | if(c == ';') { |
| 249 | String ename = ebuf.toString(); |
| 250 | ebuf = new StringBuilder(); |
| 251 | String rep = entity(ename); |
| 252 | if(rep == null) |
| 253 | throw(new ParseException("Unknown entity `" + ename + "' encountered")); |
| 254 | buf.append(rep); |
| 255 | st = "text"; |
| 256 | c = in.read(); |
| 257 | } else if(c < 0) { |
| 258 | throw(new ParseException("Unexpected end-of-file while parsing entity name")); |
| 259 | } else if(namechar((char)c)) { |
| 260 | ebuf.append((char)c); |
| 261 | c = in.read(); |
| 262 | } else { |
| 263 | throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in entity name")); |
| 264 | } |
| 265 | } else if(st == "aent") { |
| 266 | if(c == ';') { |
| 267 | String ename = ebuf.toString(); |
| 268 | ebuf = new StringBuilder(); |
| 269 | String rep = entity(ename); |
| 270 | if(rep == null) |
| 271 | throw(new ParseException("Unknown entity `" + ename + "' encountered")); |
| 272 | buf.append(rep); |
| 273 | st = "aval"; |
| 274 | c = in.read(); |
| 275 | } else if(c < 0) { |
| 276 | throw(new ParseException("Unexpected end-of-file while parsing entity name")); |
| 277 | } else if(namechar((char)c)) { |
| 278 | ebuf.append((char)c); |
| 279 | c = in.read(); |
| 280 | } else { |
| 281 | throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in entity name")); |
| 282 | } |
| 283 | } else { |
| 284 | throw(new Error("BUG: Typoed state " + st)); |
| 285 | } |
| 286 | } |
| 287 | } |
| 288 | |
| 289 | private static String printable(char c) { |
| 290 | if(c < 32) |
| 291 | return(String.format("\\%03o", (int)c)); |
| 292 | return(Character.toString(c)); |
| 293 | } |
| 294 | |
| 295 | public static void main(String[] args) throws Exception { |
| 296 | Parser p = new Parser(); |
| 297 | DocumentFragment f = p.parse(new FileReader(args[0])); |
| 298 | javax.xml.transform.TransformerFactory fac = javax.xml.transform.TransformerFactory.newInstance(); |
| 299 | fac.setAttribute("indent-number", 2); |
| 300 | javax.xml.transform.Transformer t = fac.newTransformer(); |
| 301 | t.setOutputProperty(javax.xml.transform.OutputKeys.INDENT, "yes"); |
| 302 | t.transform(new javax.xml.transform.dom.DOMSource(f), new javax.xml.transform.stream.StreamResult(System.out)); |
| 303 | System.out.println(t.getClass()); |
| 304 | } |
| 305 | } |