File: main.hs

package info (click to toggle)
haskell-html-conduit 1.3.2.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 104 kB
  • sloc: haskell: 580; makefile: 3
file content (193 lines) | stat: -rw-r--r-- 9,070 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
{-# LANGUAGE OverloadedStrings #-}
import Test.HUnit hiding (Test)
import Test.Hspec
import Test.Hspec.QuickCheck
import Data.ByteString.Lazy.Char8 ()
import qualified Text.HTML.DOM as H
import qualified Text.XML as X
import qualified Data.Map as Map
import qualified Data.Text as T
import Control.Exception (evaluate)
import Control.DeepSeq (($!!))
import Control.Monad (void)

main :: IO ()
main = hspec $ do
    describe "parses" $ do
        it "well-formed document" $
            X.parseLBS_ X.def "<foo><bar>baz</bar></foo>" @=?
            H.parseLBS        "<foo><bar>baz</bar></foo>"
        it "adds missing close tags" $
            X.parseLBS_ X.def "<foo><bar>baz</bar></foo>" @=?
            H.parseLBS        "<foo><bar>baz</foo>"
        it "void tags" $
            X.parseLBS_ X.def "<foo><bar><img/>foo</bar></foo>" @=?
            H.parseLBS        "<foo><bar><img>foo</foo>"
        it "xml entities" $
            X.parseLBS_ X.def "<foo><bar>baz&gt;</bar></foo>" @=?
            H.parseLBS        "<foo><bar>baz&gt;</foo>"
        it "html entities" $
            X.parseLBS_ X.def "<foo><bar>baz&#160;</bar></foo>" @=?
            H.parseLBS        "<foo><bar>baz&nbsp;</foo>"
        it "decimal entities" $
            X.parseLBS_ X.def "<foo><bar>baz&#160;</bar></foo>" @=?
            H.parseLBS        "<foo><bar>baz&#160;</foo>"
        it "hex entities" $
            X.parseLBS_ X.def "<foo><bar>baz&#x160;</bar></foo>" @=?
            H.parseLBS        "<foo><bar>baz&#x160;</foo>"
        it "invalid entities" $
            X.parseLBS_ X.def "<foo><bar>baz&amp;foobar;</bar></foo>" @=?
            H.parseLBS        "<foo><bar>baz&foobar;</foo>"
        it "multiple root elements" $
            X.parseLBS_ X.def "<html><foo><bar>baz&amp;foobar;</bar></foo><foo/></html>" @=?
            H.parseLBS        "<foo><bar>baz&foobar;</foo><foo>"
        it "doesn't strip whitespace" $
            X.parseLBS_ X.def "<foo>  hello</foo>" @=?
            H.parseLBS        "<foo>  hello</foo>"
        it "split code-points" $
            X.parseLBS_ X.def "<foo>&#xa0;</foo>" @=?
            H.parseBSChunks ["<foo>\xc2", "\xa0</foo>"]
        it "latin1 codes" $
            X.parseText_ X.def "<foo>\232</foo>" @=?
            H.parseSTChunks ["<foo>\232</foo>"]
        it "latin1 codes strict vs lazy" $
            H.parseLT "<foo>\232</foo>" @=?
            H.parseSTChunks ["<foo>\232</foo>"]
    describe "HTML parsing" $ do
        it "XHTML" $
            let html = "<html><head><title>foo</title></head><body><p>Hello World</p></body></html>"
                doc = X.Document (X.Prologue [] Nothing []) root []
                root = X.Element "html" Map.empty
                    [ X.NodeElement $ X.Element "head" Map.empty
                        [ X.NodeElement $ X.Element "title" Map.empty
                            [X.NodeContent "foo"]
                        ]
                    , X.NodeElement $ X.Element "body" Map.empty
                        [ X.NodeElement $ X.Element "p" Map.empty
                            [X.NodeContent "Hello World"]
                        ]
                    ]
             in H.parseLBS html @?= doc
        it "XHTML with doctype and <?xml #30" $ do
            let html = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html><head><title>foo</title></head><body><p>Hello World</p></body></html>"
                doc = X.Document (X.Prologue [] Nothing []) root []
                root = X.Element "html" Map.empty
                    [ X.NodeElement $ X.Element "head" Map.empty
                        [ X.NodeElement $ X.Element "title" Map.empty
                            [X.NodeContent "foo"]
                        ]
                    , X.NodeElement $ X.Element "body" Map.empty
                        [ X.NodeElement $ X.Element "p" Map.empty
                            [X.NodeContent "Hello World"]
                        ]
                    ]
             in H.parseLBS html @?= doc
        it "HTML" $
            let html = "<html><head><title>foo</title></head><body><br><p>Hello World</p></body></html>"
                doc = X.Document (X.Prologue [] Nothing []) root []
                root = X.Element "html" Map.empty
                    [ X.NodeElement $ X.Element "head" Map.empty
                        [ X.NodeElement $ X.Element "title" Map.empty
                            [X.NodeContent "foo"]
                        ]
                    , X.NodeElement $ X.Element "body" Map.empty
                        [ X.NodeElement $ X.Element "br" Map.empty []
                        , X.NodeElement $ X.Element "p" Map.empty
                            [X.NodeContent "Hello World"]
                        ]
                    ]
             in H.parseLBS html @?= doc
        it "Mixed case br #167" $
            let html = "<html><head><title>foo</title></head><body><bR><p>Hello World</p><BR>done</body></html>"
                doc = X.Document (X.Prologue [] Nothing []) root []
                root = X.Element "html" Map.empty
                    [ X.NodeElement $ X.Element "head" Map.empty
                        [ X.NodeElement $ X.Element "title" Map.empty
                            [X.NodeContent "foo"]
                        ]
                    , X.NodeElement $ X.Element "body" Map.empty
                        [ X.NodeElement $ X.Element "bR" Map.empty []
                        , X.NodeElement $ X.Element "p" Map.empty
                            [X.NodeContent "Hello World"]
                        , X.NodeElement $ X.Element "BR" Map.empty []
                        , X.NodeContent "done"
                        ]
                    ]
             in H.parseLBS html @?= doc

    it "doesn't double unescape" $
        let html = "<p>Hello &amp;gt; World</p>"
            doc = X.Document (X.Prologue [] Nothing []) root []
            root = X.Element "p" Map.empty
                [ X.NodeContent "Hello &gt; World"
                ]
         in H.parseLBS html @?= doc

    it "handles entities in attributes" $
        let html = "<br title=\"Mac &amp; Cheese\">"
            doc = X.Document (X.Prologue [] Nothing []) root []
            root = X.Element "br" (Map.singleton "title" "Mac & Cheese") []
         in H.parseLBS html @?= doc

    it "doesn't double escape entities in attributes" $
        let html = "<br title=\"Mac &amp;amp; Cheese\">"
            doc = X.Document (X.Prologue [] Nothing []) root []
            root = X.Element "br" (Map.singleton "title" "Mac &amp; Cheese") []
         in H.parseLBS html @?= doc

    describe "script tags" $ do
      it "ignores funny characters" $
        let html = "<script>hello <> world</script>"
            doc = X.Document (X.Prologue [] Nothing []) root []
            root = X.Element "script" Map.empty [X.NodeContent "hello <> world"]
         in H.parseLBS html @?= doc

      {-

       Would be nice... doesn't work with tagstream-conduit original
       code. Not even sure if the HTML5 parser spec discusses this
       case.

      it "ignores </script> inside string" $
        let html = "<script>hello \"</script>\" world</script>"
            doc = X.Document (X.Prologue [] Nothing []) root []
            root = X.Element "script" Map.empty [X.NodeContent "hello \"</script>\" world"]
         in H.parseLBS html @?= doc

      -}

      it "unterminated" $
        let html = "<script>hello > world"
            doc = X.Document (X.Prologue [] Nothing []) root []
            root = X.Element "script" Map.empty [X.NodeContent "hello > world"]
         in H.parseLBS html @?= doc

      it "entities" $
        let html = "<script>hello &amp; world"
            doc = X.Document (X.Prologue [] Nothing []) root []
            root = X.Element "script" Map.empty [X.NodeContent "hello &amp; world"]
         in H.parseLBS html @?= doc

    prop "parses all random input" $ \strs -> void $ evaluate $!! H.parseSTChunks $ map T.pack strs

    describe "#128 entities cut off" $ do
      it "reported issue" $ do
        let html = "<a href=\"https://example.com?a=b&#038;c=d\">link</a>"
            doc = X.Document (X.Prologue [] Nothing []) root []
            root = X.Element
              "a"
              (Map.singleton "href" "https://example.com?a=b&c=d")
              [X.NodeContent "link"]
         in H.parseLBS html @?= doc

      it "from test suite" $ do
        let html = "<a class=\"u-url\" href=\"https://secure.gravatar.com/avatar/947b5f3f323da0ef785b6f02d9c265d6?s=96&#038;d=blank&#038;r=g\">link</a>"
            doc = X.Document (X.Prologue [] Nothing []) root []
            root = X.Element
              "a"
              (Map.fromList
                  [ ("href", "https://secure.gravatar.com/avatar/947b5f3f323da0ef785b6f02d9c265d6?s=96&d=blank&r=g")
                  , ("class", "u-url")
                  ])
              [X.NodeContent "link"]
         in H.parseLBS html @?= doc