1 /*
2 * Copyright 2001-2005 (C) MetaStuff, Ltd. All Rights Reserved.
3 *
4 * This software is open source.
5 * See the bottom of this file for the licence.
6 */
7 package org.dom4j.io;
8
import java.io.IOException;
import java.io.OutputStream;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Set;

import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.Entity;
import org.dom4j.NodeType;

import org.xml.sax.SAXException;
26
27 /**
28 * <p>
29 * <code>HTMLWriter</code> takes a DOM4J tree and formats it to a stream as
30 * HTML. This formatter is similar to XMLWriter but it outputs the text of CDATA
31 * and Entity sections rather than the serialised format as in XML, it has an
 * XHTML mode, it retains whitespace in certain elements such as &lt;PRE&gt;,
 * and it supports certain elements which have no corresponding close tag such
 * as &lt;BR&gt; and &lt;P&gt;.
35 * </p>
36 *
37 * <p>
38 * The OutputFormat passed in to the constructor is checked for isXHTML() and
 * isExpandEmptyElements(). See {@link OutputFormat OutputFormat} for details.
40 * Here are the rules for <b>this class </b> based on an OutputFormat, "format",
41 * passed in to the constructor: <br/><br/>
42 *
43 * <ul>
44 * <li>If an element is in {@link #getOmitElementCloseSet()
45 * getOmitElementCloseSet}, then it is treated specially:
46 *
47 * <ul>
48 * <li>It never expands, since some browsers treat this as two separate
 * Horizontal Rules: &lt;HR&gt;&lt;/HR&gt;</li>
50 * <li>If {@link org.dom4j.io.OutputFormat#isXHTML() format.isXHTML()}, then
51 * it has a space before the closing single-tag slash, since Netscape 4.x-
 * treats this: &lt;HR /&gt; as an element named "HR" with an attribute named
 * "/", but that's better than when it refuses to recognize this: &lt;hr/&gt;
54 * which it thinks is an element named "HR/".</li>
55 * </ul>
56 *
57 * </li>
58 * <li>If {@link org.dom4j.io.OutputFormat#isXHTML() format.isXHTML()}, all
59 * elements must have either a close element, or be a closed single tag.</li>
60 * <li>If {@link org.dom4j.io.OutputFormat#isExpandEmptyElements()
 * format.isExpandEmptyElements()} is true, all elements are expanded except
62 * as above.</li>
63 * </ul>
64 *
65 * <b>Examples </b>
66 * </p>
67 *
68 * <p>
69 * </p>
70 *
71 * <p>
72 * If isXHTML == true, CDATA sections look like this:
73 *
74 * <PRE>
75 *
 * <b>&lt;myelement&gt;&lt;![CDATA[My data]]&gt;&lt;/myelement&gt; </b>
77 *
78 * </PRE>
79 *
80 * Otherwise, they look like this:
81 *
82 * <PRE>
83 *
 * <b>&lt;myelement&gt;My data&lt;/myelement&gt; </b>
85 *
86 * </PRE>
87 *
88 * </p>
89 *
90 * <p>
 * Basically, {@link OutputFormat#isXHTML() OutputFormat.isXHTML()} ==
92 * <code>true</code> will produce valid XML, while {@link
93 * org.dom4j.io.OutputFormat#isExpandEmptyElements()
94 * format.isExpandEmptyElements()} determines whether empty elements are
95 * expanded if isXHTML is true, excepting the special HTML single tags.
96 * </p>
97 *
98 * <p>
99 * Also, HTMLWriter handles tags whose contents should be preformatted, that is,
 * whitespace-preserved. By default, this set includes the tags &lt;PRE&gt;,
 * &lt;SCRIPT&gt;, &lt;STYLE&gt;, and &lt;TEXTAREA&gt;, case insensitively. It
 * does not include &lt;IFRAME&gt;. Other tags, such as &lt;CODE&gt;,
 * &lt;KBD&gt;, &lt;TT&gt;, &lt;VAR&gt;, are usually rendered in a different
104 * font in most browsers, but don't preserve whitespace, so they also don't
105 * appear in the default list. HTML Comments are always whitespace-preserved.
106 * However, the parser you use may store comments with linefeed-only text nodes
107 * (\n) even if your platform uses another line.separator character, and
108 * HTMLWriter outputs Comment nodes exactly as the DOM is set up by the parser.
 * See examples and discussion here: {@link #setPreformattedTags(java.util.Set)
110 * setPreformattedTags}
111 * </p>
112 *
113 * <p>
114 * <b>Examples </b>
115 * </p>
116 * <blockquote>
117 * <p>
118 * <b>Pretty Printing </b>
119 * </p>
120 *
121 * <p>
122 * This example shows how to pretty print a string containing a valid HTML
123 * document to a string. You can also just call the static methods of this
124 * class: <br>
 * {@link #prettyPrintHTML(String) prettyPrintHTML(String)} or <br>
 * {@link #prettyPrintHTML(String,boolean,boolean,boolean,boolean)
 * prettyPrintHTML(String,boolean,boolean,boolean,boolean)} or, <br>
 * {@link #prettyPrintXHTML(String) prettyPrintXHTML(String)} for XHTML (note
129 * the X)
130 * </p>
131 *
132 * <pre>
133 * String testPrettyPrint(String html) {
134 * StringWriter sw = new StringWriter();
135 * OutputFormat format = OutputFormat.createPrettyPrint();
136 * // These are the default values for createPrettyPrint,
137 * // so you needn't set them:
138 * // format.setNewlines(true);
 * // format.setTrimText(true);
140 * format.setXHTML(true);
141 * HTMLWriter writer = new HTMLWriter(sw, format);
142 * Document document = DocumentHelper.parseText(html);
143 * writer.write(document);
144 * writer.flush();
145 * return sw.toString();
146 * }
147 * </pre>
148 *
149 * <p>
150 * This example shows how to create a "squeezed" document, but one that will
151 * work in browsers even if the browser line length is limited. No newlines are
 * included, no extra whitespace at all, except where it is required by
153 * {@link #setPreformattedTags(java.util.Set) setPreformattedTags}.
154 * </p>
155 *
156 * <pre>
157 * String testCrunch(String html) {
158 * StringWriter sw = new StringWriter();
159 * OutputFormat format = OutputFormat.createPrettyPrint();
160 * format.setNewlines(false);
161 * format.setTrimText(true);
162 * format.setIndent("");
163 * format.setXHTML(true);
164 * format.setExpandEmptyElements(false);
165 * format.setNewLineAfterNTags(20);
166 * org.dom4j.io.HTMLWriter writer = new HTMLWriter(sw, format);
167 * org.dom4j.Document document = DocumentHelper.parseText(html);
168 * writer.write(document);
169 * writer.flush();
170 * return sw.toString();
171 * }
172 * </pre>
173 *
174 * </blockquote>
175 *
176 * @author <a href="mailto:james.strachan@metastuff.com">James Strachan </a>
177 * @author Laramie Crocker
178 * @version $Revision: 1.21 $
179 */
180 public class HTMLWriter extends XMLWriter {
181
182 private static String lineSeparator = System.getProperty("line.separator");
183 protected static final HashSet<String> DEFAULT_PREFORMATTED_TAGS;
184
185
186 static {
187 // If you change this list, update the javadoc examples, above in the
188 // class javadoc, in writeElement, and in setPreformattedTags().
189 DEFAULT_PREFORMATTED_TAGS = new HashSet<String>();
190 DEFAULT_PREFORMATTED_TAGS.add("PRE");
191 DEFAULT_PREFORMATTED_TAGS.add("SCRIPT");
192 DEFAULT_PREFORMATTED_TAGS.add("STYLE");
193 DEFAULT_PREFORMATTED_TAGS.add("TEXTAREA");
194 }
195 protected static final OutputFormat DEFAULT_HTML_FORMAT;
196
197
198 static {
199 DEFAULT_HTML_FORMAT = new OutputFormat(" ", true);
200 DEFAULT_HTML_FORMAT.setTrimText(true);
201 DEFAULT_HTML_FORMAT.setSuppressDeclaration(true);
202 }
203 private LinkedList<FormatState> formatStack = new LinkedList<FormatState>();
204 private String lastText = "";
205 private int tagsOuput = 0; // legal values are 0+, but -1 signifies lazy initialization.
206 private int newLineAfterNTags = -1;
207 private HashSet<String> preformattedTags = DEFAULT_PREFORMATTED_TAGS;
208 /**
209 * Used to store the qualified element names which should have no close
210 * element tag
211 */
212 private HashSet<String> omitElementCloseSet;
213
214 public HTMLWriter(Writer writer) {
215 super(writer, DEFAULT_HTML_FORMAT);
216 }
217
218 public HTMLWriter(Writer writer, OutputFormat format) {
219 super(writer, format);
220 }
221
222 public HTMLWriter() throws UnsupportedEncodingException {
223 super(DEFAULT_HTML_FORMAT);
224 }
225
226 public HTMLWriter(OutputFormat format) throws UnsupportedEncodingException {
227 super(format);
228 }
229
230 public HTMLWriter(OutputStream out) throws UnsupportedEncodingException {
231 super(out, DEFAULT_HTML_FORMAT);
232 }
233
234 public HTMLWriter(OutputStream out, OutputFormat format)
235 throws UnsupportedEncodingException {
236 super(out, format);
237 }
238
239 @Override
240 public void startCDATA() throws SAXException {
241 }
242
243 @Override
244 public void endCDATA() throws SAXException {
245 }
246
247 // Overloaded methods
248 // added isXHTML() stuff so you get the CDATA brackets if you desire.
249 @Override
250 protected void writeCDATA(String text) throws IOException {
251 // XXX: Should we escape entities?
252 // writer.write( escapeElementEntities( text ) );
253 if (getOutputFormat().isXHTML()) {
254 super.writeCDATA(text);
255 } else {
256 writer.write(text);
257 }
258
259 lastOutputNodeType = NodeType.CDATA_SECTION_NODE;
260 }
261
262 @Override
263 protected void writeEntity(Entity entity) throws IOException {
264 writer.write(entity.getText());
265 lastOutputNodeType = NodeType.ENTITY_REFERENCE_NODE;
266 }
267
268 @Override
269 protected void writeDeclaration() throws IOException {
270 }
271
272 @Override
273 protected void writeString(String text) throws IOException {
274 /*
275 * DOM stores \n at the end of text nodes that are newlines. This is
276 * significant if we are in a PRE section. However, we only want to
277 * output the system line.separator, not \n. This is a little brittle,
278 * but this function appears to be called with these lineseparators as a
279 * separate TEXT_NODE. If we are in a preformatted section, output the
280 * right line.separator, otherwise ditch. If the single \n character is
281 * not the text, then do the super thing to output the text.
282 *
283 * Also, we store the last text that was not a \n since it may be used
284 * by writeElement in this class to line up preformatted tags.
285 */
286 if (text.equals("\n")) {
287 if (!formatStack.isEmpty()) {
288 super.writeString(lineSeparator);
289 }
290
291 return;
292 }
293
294 lastText = text;
295
296 if (formatStack.isEmpty()) {
297 super.writeString(text.trim());
298 } else {
299 super.writeString(text);
300 }
301 }
302
303 /**
304 * Overriden method to not close certain element names to avoid wierd
305 * behaviour from browsers for versions up to 5.x
306 *
307 * @param qualifiedName
308 * DOCUMENT ME!
309 *
310 * @throws IOException
311 * DOCUMENT ME!
312 */
313 @Override
314 protected void writeClose(String qualifiedName) throws IOException {
315 if (!omitElementClose(qualifiedName)) {
316 super.writeClose(qualifiedName);
317 }
318 }
319
320 @Override
321 protected void writeEmptyElementClose(String qualifiedName)
322 throws IOException {
323 if (getOutputFormat().isXHTML()) {
324 // xhtml, always check with format object whether to expand or not.
325 if (omitElementClose(qualifiedName)) {
326 // it was a special omit tag, do it the XHTML way: "<br/>",
327 // ignoring the expansion option, since <br></br> is OK XML,
328 // but produces twice the linefeeds desired in the browser.
329 // for netscape 4.7, though all are fine with it, write a space
330 // before the close slash.
331 writer.write(" />");
332 } else {
333 super.writeEmptyElementClose(qualifiedName);
334 }
335 } else {
336 // html, not xhtml
337 if (omitElementClose(qualifiedName)) {
338 // it was a special omit tag, do it the old html way: "<br>".
339 writer.write(">");
340 } else {
341 // it was NOT a special omit tag, check with format object
342 // whether to expand or not.
343 super.writeEmptyElementClose(qualifiedName);
344 }
345 }
346 }
347
348 protected boolean omitElementClose(String qualifiedName) {
349 return internalGetOmitElementCloseSet().contains(
350 qualifiedName.toUpperCase());
351 }
352
353 private HashSet internalGetOmitElementCloseSet() {
354 if (omitElementCloseSet == null) {
355 omitElementCloseSet = new HashSet<String>();
356 loadOmitElementCloseSet(omitElementCloseSet);
357 }
358
359 return omitElementCloseSet;
360 }
361
362 // If you change this, change the javadoc for getOmitElementCloseSet.
363 protected void loadOmitElementCloseSet(Set<String> set) {
364 set.add("AREA");
365 set.add("BASE");
366 set.add("BR");
367 set.add("COL");
368 set.add("HR");
369 set.add("IMG");
370 set.add("INPUT");
371 set.add("LINK");
372 set.add("META");
373 set.add("P");
374 set.add("PARAM");
375 }
376
377 // let the people see the set, but not modify it.
378 /**
379 * A clone of the Set of elements that can have their close-tags omitted. By
380 * default it should be "AREA", "BASE", "BR", "COL", "HR", "IMG", "INPUT",
381 * "LINK", "META", "P", "PARAM"
382 *
383 * @return A clone of the Set.
384 */
385 public Set getOmitElementCloseSet() {
386 return (Set) (internalGetOmitElementCloseSet().clone());
387 }
388
389 /**
390 * To use the empty set, pass an empty Set, or null:
391 *
392 * <pre>
393 *
394 *
395 * setOmitElementCloseSet(new HashSet());
396 * or
397 * setOmitElementCloseSet(null);
398 *
399 *
400 * </pre>
401 *
402 * @param newSet
403 * DOCUMENT ME!
404 */
405 public void setOmitElementCloseSet(Set<String> newSet) {
406 // resets, and safely empties it out if newSet is null.
407 omitElementCloseSet = new HashSet<String>();
408
409 if (newSet != null) {
410 omitElementCloseSet = new HashSet<String>();
411 for (String tag : newSet) {
412 if (tag != null) {
413 omitElementCloseSet.add(tag.toUpperCase());
414 }
415 }
416 }
417 }
418
419 /**
420 * @see #setPreformattedTags(java.util.Set) setPreformattedTags
421 */
422 public Set<String> getPreformattedTags() {
423 return Collections.unmodifiableSet(preformattedTags);
424 }
425
426 /**
427 * <p>
428 * Override the default set, which includes PRE, SCRIPT, STYLE, and
429 * TEXTAREA, case insensitively.
430 * </p>
431 *
432 * <p>
433 * <b>Setting Preformatted Tags </b>
434 * </p>
435 *
436 * <p>
437 * Pass in a Set of Strings, one for each tag name that should be treated
438 * like a PRE tag. You may pass in null or an empty Set to assign the empty
439 * set, in which case no tags will be treated as preformatted, except that
440 * HTML Comments will continue to be preformatted. If a tag is included in
441 * the set of preformatted tags, all whitespace within the tag will be
442 * preserved, including whitespace on the same line preceding the close tag.
443 * This will generally make the close tag not line up with the start tag,
444 * but it preserves the intention of the whitespace within the tag.
445 * </p>
446 *
447 * <p>
448 * The browser considers leading whitespace before the close tag to be
449 * significant, but leading whitespace before the open tag to be
450 * insignificant. For example, if the HTML author doesn't put the close
451 * TEXTAREA tag flush to the left margin, then the TEXTAREA control in the
452 * browser will have spaces on the last line inside the control. This may be
453 * the HTML author's intent. Similarly, in a PRE, the browser treats a
454 * flushed left close PRE tag as different from a close tag with leading
455 * whitespace. Again, this must be left up to the HTML author.
456 * </p>
457 *
458 * <p>
459 * <b>Examples </b>
460 * </p>
461 * <blockquote>
462 * <p>
463 * Here is an example of how you can set the PreformattedTags list using
464 * setPreformattedTags to include IFRAME, as well as the default set, if you
465 * have an instance of this class named myHTMLWriter:
466 *
467 * <pre>
468 * Set current = myHTMLWriter.getPreformattedTags();
469 * current.add("IFRAME");
470 * myHTMLWriter.setPreformattedTags(current);
471 *
472 * //The set is now <b>PRE, SCRIPT, STYLE, TEXTAREA, IFRAME</b>
473 *
474 *
475 * </pre>
476 *
477 * Similarly, you can simply replace it with your own:
478 *
479 * <pre>
480 *
481 *
482 * HashSet newset = new HashSet();
483 * newset.add("PRE");
484 * newset.add("TEXTAREA");
485 * myHTMLWriter.setPreformattedTags(newset);
486 *
487 * //The set is now <b>{PRE, TEXTAREA}</b>
488 *
489 *
490 * </pre>
491 *
492 * You can remove all tags from the preformatted tags list, with an empty
493 * set, like this:
494 *
495 * <pre>
496 *
497 *
498 * myHTMLWriter.setPreformattedTags(new HashSet());
499 *
500 * //The set is now <b>{}</b>
501 *
502 *
503 * </pre>
504 *
505 * or with null, like this:
506 *
507 * <pre>
508 *
509 *
510 * myHTMLWriter.setPreformattedTags(null);
511 *
512 * //The set is now <b>{}</b>
513 *
514 *
515 * </pre>
516 *
517 * </p>
518 * </blockquote>
519 *
520 * @param newSet
521 * DOCUMENT ME!
522 */
523 public void setPreformattedTags(Set<String> newSet) {
524 // no fancy merging, just set it, assuming they did a
525 // getExcludeTrimTags() first if they wanted to preserve the default
526 // set.
527 // resets, and safely empties it out if newSet is null.
528 preformattedTags = new HashSet<String>();
529
530 if (newSet != null) {
531 for (String tag : newSet) {
532 if (tag != null) {
533 preformattedTags.add(tag.toUpperCase());
534 }
535 }
536 }
537 }
538
539 /**
540 * DOCUMENT ME!
541 *
542 * @param qualifiedName
543 * DOCUMENT ME!
544 *
545 * @return true if the qualifiedName passed in matched (case-insensitively)
546 * a tag in the preformattedTags set, or false if not found or if
547 * the set is empty or null.
548 *
549 * @see #setPreformattedTags(java.util.Set) setPreformattedTags
550 */
551 public boolean isPreformattedTag(String qualifiedName) {
552 // A null set implies that the user called setPreformattedTags(null),
553 // which means they want no tags to be preformatted.
554 return (preformattedTags != null) && (preformattedTags.contains(qualifiedName.toUpperCase()));
555 }
556
557 /**
558 * This override handles any elements that should not remove whitespace,
559 * such as <PRE>, <SCRIPT>, <STYLE>, and <TEXTAREA>.
560 * Note: the close tags won't line up with the open tag, but we can't alter
561 * that. See javadoc note at setPreformattedTags.
562 *
563 * @param element
564 * DOCUMENT ME!
565 *
566 * @throws IOException
567 * When the stream could not be written to.
568 *
569 * @see #setPreformattedTags(java.util.Set) setPreformattedTags
570 */
571 @Override
572 protected void writeElement(Element element) throws IOException {
573 if (newLineAfterNTags == -1) { // lazy initialization check
574 lazyInitNewLinesAfterNTags();
575 }
576
577 if (newLineAfterNTags > 0) {
578 if ((tagsOuput > 0) && ((tagsOuput % newLineAfterNTags) == 0)) {
579 super.writer.write(lineSeparator);
580 }
581 }
582
583 tagsOuput++;
584
585 String qualifiedName = element.getQualifiedName();
586 String saveLastText = lastText;
587 int size = element.nodeCount();
588
589 if (isPreformattedTag(qualifiedName)) {
590 OutputFormat currentFormat = getOutputFormat();
591 boolean saveNewlines = currentFormat.isNewlines();
592 boolean saveTrimText = currentFormat.isTrimText();
593 String currentIndent = currentFormat.getIndent();
594
595 // You could have nested PREs, or SCRIPTS within PRE... etc.,
596 // therefore use push and pop.
597 formatStack.addFirst(new FormatState(saveNewlines, saveTrimText,
598 currentIndent));
599
600 try {
601 // do this manually, since it won't be done while outputting
602 // the tag.
603 super.writePrintln();
604
605 if ((saveLastText.trim().length() == 0) && (currentIndent != null) && (currentIndent.length() > 0)) {
606 // We are indenting, but we want to line up with the close
607 // tag. lastText was the indent (whitespace, no \n) before
608 // the preformatted start tag. So write it out instead of
609 // the current indent level. This makes it line up with its
610 // close tag.
611 super.writer.write(justSpaces(saveLastText));
612 }
613
614 // actually, newlines are handled in this class by writeString,
615 // depending on if the stack is empty.
616 currentFormat.setNewlines(false);
617 currentFormat.setTrimText(false);
618 currentFormat.setIndent("");
619
620 // This line is the recursive one:
621 super.writeElement(element);
622 } finally {
623 FormatState state = formatStack.poll();
624 currentFormat.setNewlines(state.isNewlines());
625 currentFormat.setTrimText(state.isTrimText());
626 currentFormat.setIndent(state.getIndent());
627 }
628 } else {
629 super.writeElement(element);
630 }
631 }
632
633 private String justSpaces(String text) {
634 int size = text.length();
635 StringBuffer res = new StringBuffer(size);
636 char c;
637
638 for (int i = 0; i < size; i++) {
639 c = text.charAt(i);
640
641 switch (c) {
642 case '\r':
643 case '\n':
644
645 continue;
646
647 default:
648 res.append(c);
649 }
650 }
651
652 return res.toString();
653 }
654
655 private void lazyInitNewLinesAfterNTags() {
656 if (getOutputFormat().isNewlines()) {
657 // don't bother, newlines are going to happen anyway.
658 newLineAfterNTags = 0;
659 } else {
660 newLineAfterNTags = getOutputFormat().getNewLineAfterNTags();
661 }
662 }
663
664 // Convenience methods, static, with bunch-o-defaults
665 /**
666 * Convenience method to just get a String result.
667 *
668 * @param html
669 * DOCUMENT ME!
670 *
671 * @return a pretty printed String from the source string, preserving
672 * whitespace in the defaultPreformattedTags set, and leaving the
673 * close tags off of the default omitElementCloseSet set. Use one of
674 * the write methods if you want stream output.
675 *
676 * @throws java.io.IOException
677 * @throws java.io.UnsupportedEncodingException
678 * @throws org.dom4j.DocumentException
679 */
680 public static String prettyPrintHTML(String html)
681 throws java.io.IOException, java.io.UnsupportedEncodingException,
682 org.dom4j.DocumentException {
683 return prettyPrintHTML(html, true, true, false, true);
684 }
685
686 /**
687 * Convenience method to just get a String result, but <b>As XHTML </b>.
688 *
689 * @param html
690 * DOCUMENT ME!
691 *
692 * @return a pretty printed String from the source string, preserving
693 * whitespace in the defaultPreformattedTags set, but conforming to
694 * XHTML: no close tags are omitted (though if empty, they will be
695 * converted to XHTML empty tags: <HR/> Use one of the write
696 * methods if you want stream output.
697 *
698 * @throws java.io.IOException
699 * @throws java.io.UnsupportedEncodingException
700 * @throws org.dom4j.DocumentException
701 */
702 public static String prettyPrintXHTML(String html)
703 throws java.io.IOException, java.io.UnsupportedEncodingException,
704 org.dom4j.DocumentException {
705 return prettyPrintHTML(html, true, true, true, false);
706 }
707
708 /**
709 * DOCUMENT ME!
710 *
711 * @param html
712 * DOCUMENT ME!
713 * @param newlines
714 * DOCUMENT ME!
715 * @param trim
716 * DOCUMENT ME!
717 * @param isXHTML
718 * DOCUMENT ME!
719 * @param expandEmpty
720 * DOCUMENT ME!
721 *
722 * @return a pretty printed String from the source string, preserving
723 * whitespace in the defaultPreformattedTags set, and leaving the
724 * close tags off of the default omitElementCloseSet set. This
725 * override allows you to specify various formatter options. Use one
726 * of the write methods if you want stream output.
727 *
728 * @throws java.io.IOException
729 * @throws java.io.UnsupportedEncodingException
730 * @throws org.dom4j.DocumentException
731 */
732 public static String prettyPrintHTML(String html, boolean newlines,
733 boolean trim, boolean isXHTML, boolean expandEmpty)
734 throws java.io.IOException, java.io.UnsupportedEncodingException,
735 org.dom4j.DocumentException {
736 StringWriter sw = new StringWriter();
737 OutputFormat format = OutputFormat.createPrettyPrint();
738 format.setNewlines(newlines);
739 format.setTrimText(trim);
740 format.setXHTML(isXHTML);
741 format.setExpandEmptyElements(expandEmpty);
742
743 HTMLWriter writer = new HTMLWriter(sw, format);
744 Document document = DocumentHelper.parseText(html);
745 writer.write(document);
746 writer.flush();
747
748 return sw.toString();
749 }
750
751 // Allows us to the current state of the format in this struct on the
752 // formatStack.
753 private class FormatState {
754
755 private boolean newlines = false;
756 private boolean trimText = false;
757 private String indent = "";
758
759 public FormatState(boolean newLines, boolean trimText, String indent) {
760 this.newlines = newLines;
761 this.trimText = trimText;
762 this.indent = indent;
763 }
764
765 public boolean isNewlines() {
766 return newlines;
767 }
768
769 public boolean isTrimText() {
770 return trimText;
771 }
772
773 public String getIndent() {
774 return indent;
775 }
776 }
777 }
778
779 /*
780 * <html> <head> <title>My Title </title> <style> .foo { text-align: Right; }
781 * </style> <script> function mojo(){ return "bar"; } </script> <script
782 * language="JavaScript"> <!-- //this is the canonical javascript hiding.
783 * function foo(){ return "foo"; } //--> </script> </head> <!-- this is a
784 * comment --> <body bgcolor="#A4BFDD" mojo="&"> entities:   &
785 * " < > %23 <p></p> <mojo> </mojo> <foo /> <table border="1"> <tr>
786 * <td><pre> line0 <hr /> line1 <b>line2, should line up, indent-wise </b> line
787 * 3 line 4 </pre></td><td></td></tr> </table> <myCDATAElement> <![CDATA[My
788 * data]]> </myCDATAElement> </body> </html>
789 */
790
791 /*
792 * Redistribution and use of this software and associated documentation
793 * ("Software"), with or without modification, are permitted provided that the
794 * following conditions are met:
795 *
796 * 1. Redistributions of source code must retain copyright statements and
797 * notices. Redistributions must also contain a copy of this document.
798 *
799 * 2. Redistributions in binary form must reproduce the above copyright notice,
800 * this list of conditions and the following disclaimer in the documentation
801 * and/or other materials provided with the distribution.
802 *
803 * 3. The name "DOM4J" must not be used to endorse or promote products derived
804 * from this Software without prior written permission of MetaStuff, Ltd. For
805 * written permission, please contact dom4j-info@metastuff.com.
806 *
807 * 4. Products derived from this Software may not be called "DOM4J" nor may
808 * "DOM4J" appear in their names without prior written permission of MetaStuff,
809 * Ltd. DOM4J is a registered trademark of MetaStuff, Ltd.
810 *
811 * 5. Due credit should be given to the DOM4J Project - http://www.dom4j.org
812 *
813 * THIS SOFTWARE IS PROVIDED BY METASTUFF, LTD. AND CONTRIBUTORS ``AS IS'' AND
814 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
815 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
816 * ARE DISCLAIMED. IN NO EVENT SHALL METASTUFF, LTD. OR ITS CONTRIBUTORS BE
817 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
818 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
819 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
820 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
821 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
822 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
823 * POSSIBILITY OF SUCH DAMAGE.
824 *
825 * Copyright 2001-2005 (C) MetaStuff, Ltd. All Rights Reserved.
826 */