File: JTidyHTMLParser.java

package info (click to toggle)
httpunit 1.7+dfsg-12
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 3,564 kB
  • ctags: 6,646
  • sloc: java: 33,665; xml: 482; sh: 68; makefile: 11
file content (100 lines) | stat: -rw-r--r-- 3,815 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
package com.meterware.httpunit.parsing;
/********************************************************************************************************************
 * $Id: JTidyHTMLParser.java 855 2008-03-31 08:54:13Z wolfgang_fahl $
 *
 * Copyright (c) 2002,2004,2008 Russell Gold
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
 * documentation files (the "Software"), to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
 * to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
 * THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
 * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 *******************************************************************************************************************/
import org.w3c.tidy.Tidy;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.w3c.dom.html.HTMLDocument;
import org.xml.sax.SAXException;

import java.net.URL;
import java.io.IOException;
import java.io.ByteArrayInputStream;
import java.io.UnsupportedEncodingException;

import com.meterware.httpunit.dom.HTMLDocumentImpl;


/**
 * An {@link HTMLParser} implementation which uses JTidy to parse page text. The DOM produced by JTidy is
 * copied into a new {@link HTMLDocumentImpl}, which is then handed to the supplied document adapter.
 *
 * @author <a href="mailto:russgold@httpunit.org">Russell Gold</a>
 **/
class JTidyHTMLParser implements HTMLParser {

    /** Non-breaking space (character code 160), defined by JTidy. */
    private static final char NBSP = (char) 160;

    /** Name of the charset used to convert the page text into the byte stream given to JTidy. */
    private static final String UTF_ENCODING = "UTF-8";


    /**
     * Parses the specified page text with JTidy and passes the resulting HTML document to the adapter.
     * Every top-level node of the JTidy document other than the document type node is deep-imported into
     * a new {@link HTMLDocumentImpl}, in its original order.
     *
     * @param pageURL  the URL of the page; used only when creating the parser's error output
     * @param pageText the text of the page to parse
     * @param adapter  receives the resulting document via <code>setDocument</code>
     * @throws IOException      declared by the method signature
     * @throws SAXException     declared by the method signature
     * @throws RuntimeException if the UTF-8 encoding is not supported; the original exception is its cause
     **/
    public void parse( URL pageURL, String pageText, DocumentAdapter adapter ) throws IOException, SAXException {
        try {
            ByteArrayInputStream pageBytes = new ByteArrayInputStream( pageText.getBytes( UTF_ENCODING ) );
            Document jtidyDocument = getParser( pageURL ).parseDOM( pageBytes, null );   // null: no tidied output is written

            HTMLDocument htmlDocument = new HTMLDocumentImpl();
            NodeList topLevelNodes = jtidyDocument.getChildNodes();
            for (int i = 0; i < topLevelNodes.getLength(); i++) {
                Node importedNode = topLevelNodes.item(i);
                // NOTE(review): the doctype is skipped, presumably because DOM importNode does not support
                // document type nodes -- confirm against HTMLDocumentImpl.
                if (importedNode.getNodeType() != Node.DOCUMENT_TYPE_NODE) {
                    htmlDocument.appendChild( htmlDocument.importNode( importedNode, true ) );
                }
            }
            adapter.setDocument( htmlDocument );
        } catch (UnsupportedEncodingException e) {
            // keep the original exception as the cause so that its stack trace is not lost
            throw new RuntimeException( "UTF-8 encoding failed", e );
        }
    }


    /**
     * Returns the specified text with each non-breaking space replaced by an ordinary space.
     *
     * @param string the text to clean; may be null
     * @return the cleaned text, or an empty string if the argument is null
     **/
    public String getCleanedText( String string ) {
        return (string == null) ? "" : string.replace( NBSP, ' ' );
    }


    /**
     * Reports whether this parser supports preserving the case of tags.
     *
     * @return always false
     **/
    public boolean supportsPreserveTagCase() {
        return false;
    }


    /**
     * Reports whether this parser supports forcing the case of tags.
     *
     * @return always false
     **/
    public boolean supportsForceTagCase() {
        return false;
    }


    /**
     * Reports whether this parser supports returning an HTML document.
     *
     * @return always true
     **/
    public boolean supportsReturnHTMLDocument() {
        return true;
    }


    /**
     * Reports whether this parser supports parser warnings.
     *
     * @return always true
     **/
    public boolean supportsParserWarnings() {
        return true;
    }


    /**
     * Creates a new JTidy parser which reads and writes UTF8, runs in quiet mode and shows warnings only if
     * {@link HTMLParserFactory} has parser warnings enabled. If any HTML parser listeners are registered
     * with the factory, the parser's error output is directed to a {@link JTidyPrintWriter} for the URL.
     *
     * @param url the URL of the page to be parsed, passed to the error writer
     * @return a newly configured parser
     **/
    private static Tidy getParser( URL url ) {
        Tidy tidy = new Tidy();
        tidy.setInputEncoding( "UTF8" );
        tidy.setOutputEncoding( "UTF8" );
        tidy.setQuiet( true );
        tidy.setShowWarnings( HTMLParserFactory.isParserWarningsEnabled() );
        if (!HTMLParserFactory.getHTMLParserListeners().isEmpty()) {
            tidy.setErrout( new JTidyPrintWriter( url ) );
        }
        return tidy;
    }

}