domimp = di;
}
+ /**
+ * Per-parse state shared by the parsing methods: the document used as a
+ * node factory and the peekable input being consumed.
+ */
+ public class State {
+ /** Document created from the enclosing parser's DOM implementation. */
+ public final Document doc;
+ /** Peekable wrapper around the reader handed to the constructor. */
+ public final PeekReader in;
+
+ private State(Reader src) {
+ this.doc = domimp.createDocument(null, "dummy", null);
+ this.in = new PeekReader(src);
+ }
+ }
+
+ /**
+ * Tells whether a character is accepted as part of a name by this parser.
+ * Accepted characters are `:', `_', `$', `.', `-', the ASCII digits and the
+ * ASCII letters; everything else is rejected.
+ *
+ * @param c the character to classify
+ * @return true if c is one of the accepted name characters
+ */
private static boolean namechar(char c) {
return((c == ':') || (c == '_') || (c == '$') || (c == '.') || (c == '-') || ((c >= '0') && (c <= '9')) || ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z')));
}
return(doc.createElementNS(null, name));
}
+ /**
+ * Creates an attribute node with an initial value. The attribute is
+ * created in the namespace of the given element; it is not attached to
+ * the element here.
+ *
+ * @param doc the document used to create the node
+ * @param el the element whose namespace URI the attribute takes
+ * @param name the attribute name
+ * @param val the value to set on the new attribute
+ * @return the newly created attribute
+ */
+ protected Attr makeattr(Document doc, Element el, String name, String val) {
+ final String ns = el.getNamespaceURI();
+ final Attr attr = doc.createAttributeNS(ns, name);
+ attr.setValue(val);
+ return(attr);
+ }
+
+ /**
+ * Creates an attribute node without setting a value. The attribute is
+ * created in the namespace of the given element; it is not attached to
+ * the element here.
+ *
+ * @param doc the document used to create the node
+ * @param el the element whose namespace URI the attribute takes
+ * @param name the attribute name
+ * @return the newly created attribute
+ */
protected Attr makeattr(Document doc, Element el, String name) {
return(doc.createAttributeNS(el.getNamespaceURI(), name));
}
- public DocumentFragment parse(Reader in) throws IOException {
- Stack<Node> stack = new Stack<Node>();
- Document doc = domimp.createDocument(null, "dummy", null);
- DocumentFragment frag = doc.createDocumentFragment();
- stack.push(frag);
- String st = "content";
- int c = in.read();
+ /**
+ * Reads a name from the input: consumes characters for as long as the next
+ * one satisfies namechar, and stops at end-of-file or at the first other
+ * character, which is left unread.
+ *
+ * @param s the parser state to read from
+ * @return the name that was read; never empty
+ * @throws ParseException if no name character is available at the current position
+ * @throws IOException declared for the reads on the underlying input
+ */
+ protected String name(State s) throws IOException {
StringBuilder buf = new StringBuilder();
- StringBuilder ebuf = new StringBuilder();
- char atype = 0;
- int cdashcnt = 0;
while(true) {
- if(st == "content") {
- if(c == '<') {
- st = "tag";
- c = in.read();
- } else if(c < 0) {
- if(stack.peek() == frag)
- return(frag);
- else
- throw(new ParseException("Unexpected end-of-file while parsing non-root element"));
- } else {
- st = "text";
- }
- } else if(st == "tag") {
- if(Character.isWhitespace((char)c)) {
- c = in.read();
- } else if(c == '!') {
- cdashcnt = 0;
- c = in.read();
- st = "comment";
- } else if(namechar((char)c)) {
- st = "stag";
- } else if(c == '/') {
- c = in.read();
- st = "etag";
- } else if(c < 0) {
- throw(new ParseException("Unexpected end-of-file while parsing tag"));
- } else {
- throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in tag name"));
- }
- } else if(st == "stag") {
- boolean flush = false;
- if(namechar((char)c)) {
- buf.append((char)c);
- c = in.read();
- } else if(c == '>') {
- flush = true;
- } else if(Character.isWhitespace((char)c)) {
- flush = true;
- c = in.read();
- } else if(c < 0) {
- throw(new ParseException("Unexpected end-of-file while parsing tag name"));
- } else {
- throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in tag name"));
- }
- if(flush) {
- Element n = makenode(doc, buf.toString());
- buf = new StringBuilder();
- stack.peek().appendChild(n);
- stack.push(n);
- st = "attr";
- }
- } else if(st == "comment") {
- if(c == '-') {
- cdashcnt++;
- c = in.read();
- } else if((c == '>') && (cdashcnt == 4)) {
- stack.peek().appendChild(doc.createComment(buf.toString()));
- buf = new StringBuilder();
- st = "content";
- c = in.read();
- } else if(cdashcnt >= 2) {
- if(cdashcnt > 2)
- cdashcnt = 2;
- buf.append((char)c);
- c = in.read();
- } else if(c < 0) {
- throw(new ParseException("Unexpected end-of-file while parsing comment"));
- } else {
- throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in comment"));
- }
- } else if(st == "attr") {
- if(namechar((char)c)) {
- st = "aname";
- } else if(c == '>') {
- st = "content";
- c = in.read();
- } else if(c == '/') {
- st = "stagend";
- c = in.read();
- } else if(Character.isWhitespace((char)c)) {
- c = in.read();
- } else if(c < 0) {
- throw(new ParseException("Unexpected end-of-file while parsing attributes"));
- } else {
- throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered inside tag"));
- }
- } else if(st == "stagend") {
- if(c == '>') {
- stack.pop();
- c = in.read();
- st = "content";
- } else if(Character.isWhitespace((char)c)) {
- c = in.read();
- } else if(c < 0) {
- throw(new ParseException("Unexpected end-of-file at end of empty tag"));
- } else {
- throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered at and of empty tag"));
- }
- } else if(st == "aname") {
- if(namechar((char)c)) {
- buf.append((char)c);
- c = in.read();
- } else if(Character.isWhitespace((char)c)) {
- c = in.read();
- } else if(c == '=') {
- Element el = (Element)stack.peek();
- Attr attr = makeattr(doc, el, buf.toString());
- el.setAttributeNodeNS(attr);
- buf = new StringBuilder();
- stack.push(attr);
- st = "avalstart";
- c = in.read();
- } else if(c < 0) {
- throw(new ParseException("Unexpected end-of-file while parsing attribute name"));
- } else {
- throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in attribute name"));
- }
- } else if(st == "avalstart") {
- if((c == '\'') || (c == '"')) {
- atype = (char)c;
- c = in.read();
- st = "aval";
- } else if(Character.isWhitespace((char)c)) {
- c = in.read();
- } else if(c < 0) {
- throw(new ParseException("Unexpected end-of-file while parsing attribute value"));
- } else {
- throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in attribute value"));
- }
- } else if(st == "aval") {
- if(c == atype) {
- c = in.read();
- Attr a = (Attr)stack.pop();
- a.setValue(buf.toString());
- buf = new StringBuilder();
- st = "attr";
- } else if(c == '&') {
- c = in.read();
- st = "aent";
- } else if(c < 0) {
- throw(new ParseException("Unexpected end-of-file while parsing attribute value"));
- } else {
- buf.append((char)c);
- c = in.read();
- }
- } else if(st == "etag") {
- if(namechar((char)c)) {
- buf.append((char)c);
- c = in.read();
- } else if(c == '>') {
- String nm = buf.toString();
- buf = new StringBuilder();
- Node n = stack.pop();
- if(n instanceof DocumentFragment)
- throw(new ParseException("Unexpected end tag for `" + nm + "' while parsing root content"));
- Element el = (Element)n;
- if(!nm.equals(el.getTagName()))
- throw(new ParseException("Unexpected end tag for `" + nm + "' while parsing `" + el.getTagName() + "'"));
- c = in.read();
- st = "content";
- } else if(c < 0) {
- throw(new ParseException("Unexpected end-of-file while parsing end tag"));
- } else {
- throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in end tag"));
- }
- } else if(st == "text") {
- boolean flush = false;
- if(c == '&') {
- st = "ent";
- c = in.read();
- } else if(c == '<') {
- flush = true;
- st = "content";
- } else if(c < 0) {
- flush = true;
- st = "content";
- } else {
- buf.append((char)c);
- c = in.read();
- }
- if(flush) {
- Text n = doc.createTextNode(buf.toString());
- buf = new StringBuilder();
- stack.peek().appendChild(n);
- }
- } else if(st == "ent") {
- if(c == ';') {
- String ename = ebuf.toString();
- ebuf = new StringBuilder();
- String rep = entity(ename);
- if(rep == null)
- throw(new ParseException("Unknown entity `" + ename + "' encountered"));
- buf.append(rep);
- st = "text";
- c = in.read();
- } else if(c < 0) {
- throw(new ParseException("Unexpected end-of-file while parsing entity name"));
- } else if(namechar((char)c)) {
- ebuf.append((char)c);
- c = in.read();
+ int c = s.in.peek();
+ if(c < 0) {
+ break;
+ } else if(namechar((char)c)) {
+ buf.append((char)s.in.read());
+ } else {
+ break;
+ }
+ }
+ if(buf.length() == 0)
+ throw(new ParseException("Expected name, got `" + printable(s.in.peek()) + "'"));
+ return(buf.toString());
+ }
+
+ /**
+ * Reads one entity reference of the form `&name;' from the input and
+ * returns the replacement text looked up through entity(String).
+ *
+ * @param s the parser state to read from
+ * @return the replacement text for the entity; never null
+ * @throws ParseException if the reference does not start with `&', has no
+ * name, is not terminated by `;', or names an entity with no replacement
+ * @throws IOException declared for the reads on the underlying input
+ */
+ protected String entity(State s) throws IOException {
+ int c = s.in.read();
+ if(c != '&')
+ throw(new ParseException("Expected `&' while reading entity, got `" + printable(c) + "'"));
+ String nm = name(s);
+ c = s.in.read();
+ if(c != ';')
+ throw(new ParseException("Expected `;' while reading entity, got `" + printable(c) + "'"));
+ String rep = entity(nm);
+ // A null result must not reach the callers: they append the return value
+ // to a StringBuilder, which would insert the literal text "null".
+ if(rep == null)
+ throw(new ParseException("Unknown entity `" + nm + "' encountered"));
+ return(rep);
+ }
+
+ /**
+ * Parses one attribute: a name, an `=' sign and a value enclosed in
+ * matching double or single quotes. Entity references inside the value are
+ * replaced through entity(State).
+ *
+ * @param s the parser state to read from
+ * @param el the element the attribute is being created for
+ * @return the attribute built by makeattr with the parsed name and value
+ * @throws ParseException if `=' or the opening quote is missing, or the
+ * input ends before the closing quote
+ * @throws IOException declared for the reads on the underlying input
+ */
+ protected Attr attribute(State s, Element el) throws IOException {
+ String attrName = name(s);
+ // peek(true) is called for its effect on the input only; presumably it
+ // skips whitespace -- TODO confirm against PeekReader.
+ s.in.peek(true);
+ int eq = s.in.read();
+ if(eq != '=')
+ throw(new ParseException("Expected `=' while reading attribute, got `" + printable(eq) + "'"));
+ s.in.peek(true);
+ int quote = s.in.read();
+ if((quote != '"') && (quote != '\''))
+ throw(new ParseException("Expected double or single quote while reading attribute, got `" + printable(quote) + "'"));
+ StringBuilder value = new StringBuilder();
+ // quote is always a real character here, so end-of-file never matches it
+ // and is caught by the first check inside the loop.
+ for(int c = s.in.peek(); c != quote; c = s.in.peek()) {
+ if(c < 0)
+ throw(new ParseException("Unexpected end-of-file while reading attribute value"));
+ if(c == '&')
+ value.append(entity(s));
+ else
+ value.append((char)s.in.read());
+ }
+ // Consume the closing quote.
+ s.in.read();
+ return(makeattr(s.doc, el, attrName, value.toString()));
+ }
+
+ /**
+ * Parses one element, starting at its tag name; the callers in this file
+ * consume the opening `<' before getting here. Reads the attributes and,
+ * unless the start tag ends in `/>', the child nodes up to and including
+ * the matching end tag.
+ *
+ * @param s the parser state to read from and create nodes with
+ * @return the parsed element with its attributes and children attached
+ * @throws ParseException on end-of-file inside the start tag, on an
+ * unexpected character in a tag, or on a mismatched end tag
+ * @throws IOException declared for the reads on the underlying input
+ */
+ protected Element element(State s) throws IOException {
+ Element n = makenode(s.doc, name(s));
+ // Start tag: attributes until `>' or `/>'.
+ while(true) {
+ int c = s.in.peek(true);
+ if(c < 0) {
+ throw(new ParseException("Unexpected end-of-file while parsing start tag"));
+ } else if(c == '>') {
+ s.in.read();
+ break;
+ } else if(c == '/') {
+ s.in.read();
+ s.in.peek(true);
+ c = s.in.read();
+ if(c != '>')
+ throw(new ParseException("Unexpected character `" + printable(c) + "' encountered in end of empty tag"));
+ return(n);
+ } else if(namechar((char)c)) {
+ n.setAttributeNodeNS(attribute(s, n));
+ } else {
+ throw(new ParseException("Unexpected character `" + printable(c) + "' encountered in start tag"));
+ }
+ }
+ // Content: text and nested tags until the matching end tag.
+ while(true) {
+ int c = s.in.peek();
+ if(c < 0) {
+ // NOTE(review): end-of-file here returns the element although no end
+ // tag was seen -- confirm that this leniency is intended.
+ break;
+ } else if(c == '<') {
+ s.in.read();
+ c = s.in.peek(true);
+ if(c == '/') {
+ s.in.read();
+ s.in.peek(true);
+ String nm = name(s);
+ if(!nm.equals(n.getTagName()))
+ throw(new ParseException("Unexpected end tag for `" + nm + "' while parsing `" + n.getTagName() + "'"));
+ // Store the fresh peek so the message reports the offending
+ // character rather than the `/' seen earlier.
+ c = s.in.peek(true);
+ if(c != '>')
+ throw(new ParseException("Expected `>' while reading end tag, got `" + printable(c) + "'"));
+ s.in.read();
+ break;
} else {
- throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in entity name"));
+ n.appendChild(stag(s));
}
- } else if(st == "aent") {
- if(c == ';') {
- String ename = ebuf.toString();
- ebuf = new StringBuilder();
- String rep = entity(ename);
- if(rep == null)
- throw(new ParseException("Unknown entity `" + ename + "' encountered"));
- buf.append(rep);
- st = "aval";
- c = in.read();
- } else if(c < 0) {
- throw(new ParseException("Unexpected end-of-file while parsing entity name"));
- } else if(namechar((char)c)) {
- ebuf.append((char)c);
- c = in.read();
+ } else {
+ n.appendChild(text(s));
+ }
+ }
+ return(n);
+ }
+
+ /**
+ * Parses a comment, starting at the `!' that follows the opening `<'.
+ * Requires the next three characters to be `!--', then collects text up to
+ * the terminating `-->', which is consumed but not included.
+ *
+ * @param s the parser state to read from and create nodes with
+ * @return a comment node holding the collected text
+ * @throws ParseException if the input does not start with `!--' or ends
+ * before `-->' is found
+ * @throws IOException declared for the reads on the underlying input
+ */
+ protected Comment comment(State s) throws IOException {
+ if((s.in.read() != '!') ||
+ (s.in.read() != '-') ||
+ (s.in.read() != '-'))
+ throw(new ParseException("Illegal start of comment"));
+ StringBuilder buf = new StringBuilder();
+ while(true) {
+ int c = s.in.peek();
+ if(c < 0) {
+ throw(new ParseException("Unexpected end-of-file while parsing comment"));
+ } else if(c == '-') {
+ // A dash may start the terminator; dashes that turn out not to be
+ // part of `-->' are put back into the comment text.
+ s.in.read();
+ if(s.in.peek() == '-') {
+ s.in.read();
+ if(s.in.peek() == '>') {
+ s.in.read();
+ break;
+ } else {
+ buf.append("--");
+ }
} else {
- throw(new ParseException("Unexpected character `" + printable((char)c) + "' encountered in entity name"));
+ buf.append("-");
}
} else {
- throw(new Error("BUG: Typoed state " + st));
+ buf.append((char)s.in.read());
}
}
+ return(s.doc.createComment(buf.toString()));
}
-
- private static String printable(char c) {
+
+ /**
+ * Dispatches on the character following an already-consumed `<': a `!'
+ * is handed to comment(State), anything else to element(State).
+ *
+ * @param s the parser state to read from
+ * @return the comment or element node that was parsed
+ * @throws ParseException if the input ends before the tag type is known
+ * @throws IOException declared for the reads on the underlying input
+ */
+ protected Node stag(State s) throws IOException {
+ final int next = s.in.peek(true);
+ if(next < 0)
+ throw(new ParseException("Unexpected end-of-file while parsing tag type"));
+ return((next == '!') ? comment(s) : element(s));
+ }
+
+ /**
+ * Collects character data up to, but not including, the next `<' or
+ * end-of-file. Entity references are replaced through entity(State).
+ *
+ * @param s the parser state to read from and create nodes with
+ * @return a text node holding the collected characters
+ * @throws IOException declared for the reads on the underlying input
+ */
+ protected Text text(State s) throws IOException {
+ StringBuilder chars = new StringBuilder();
+ for(int c = s.in.peek(); (c >= 0) && (c != '<'); c = s.in.peek()) {
+ if(c == '&')
+ chars.append(entity(s));
+ else
+ chars.append((char)s.in.read());
+ }
+ return(s.doc.createTextNode(chars.toString()));
+ }
+
+ /**
+ * Parses the whole input into a document fragment. Each `<' starts a tag
+ * handled by stag(State); any other character starts a run of text handled
+ * by text(State). Parsing ends at end-of-file. The reader is not closed
+ * here.
+ *
+ * @param in the character source to parse
+ * @return a fragment holding the top-level nodes in input order
+ * @throws IOException declared for the reads on the underlying input
+ */
+ public DocumentFragment parse(Reader in) throws IOException {
+ State st = new State(in);
+ DocumentFragment root = st.doc.createDocumentFragment();
+ for(int c = st.in.peek(); c >= 0; c = st.in.peek()) {
+ if(c == '<') {
+ st.in.read();
+ root.appendChild(stag(st));
+ } else {
+ root.appendChild(text(st));
+ }
+ }
+ return(root);
+ }
+
+ /**
+ * Renders a character code for use in error messages.
+ *
+ * @param c a character code as returned by read or peek; may be negative
+ * @return "EOF" for a negative code, a backslash followed by three octal
+ * digits for codes below 32, and otherwise the code cast to char as a
+ * one-character string
+ */
+ private static String printable(int c) {
+ if(c < 0)
+ return("EOF");
if(c < 32)
return(String.format("\\%03o", (int)c));
- return(Character.toString(c));
+ return(Character.toString((char)c));
}
public static void main(String[] args) throws Exception {